From 0c7ec747daed8f0718d6984a437895ba261db8d1 Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Fri, 27 Feb 2026 19:39:39 +0800 Subject: [PATCH 1/9] feat: migrate website search and embeddings from Convex to crawler service Move search indexing and embedding storage from Convex to the crawler service backed by PostgreSQL + pgvector. Make all DB init scripts idempotent and run them on every container startup. Drop legacy TimescaleDB extension, ensure crawler upserts website_urls before inserting chunks, and add transactional cascading deletes when removing a website. --- compose.yml | 6 +- services/crawler/app/config.py | 16 + services/crawler/app/main.py | 47 +- services/crawler/app/models.py | 103 +++ services/crawler/app/routers/__init__.py | 9 + services/crawler/app/routers/crawler.py | 9 +- services/crawler/app/routers/index.py | 70 ++ services/crawler/app/routers/pages.py | 93 +++ services/crawler/app/routers/search.py | 78 ++ services/crawler/app/routers/websites.py | 72 +- .../crawler/app/services/chunking_service.py | 159 ++++ services/crawler/app/services/database.py | 57 ++ .../crawler/app/services/embedding_service.py | 79 ++ .../crawler/app/services/indexing_service.py | 133 ++++ .../crawler/app/services/pg_website_store.py | 340 +++++++++ services/crawler/app/services/scheduler.py | 174 +++-- .../crawler/app/services/search_service.py | 130 ++++ .../crawler/app/services/website_store.py | 451 ----------- services/crawler/pyproject.toml | 2 + .../crawler/tests/test_chunking_service.py | 251 +++++++ .../crawler/tests/test_embedding_service.py | 163 ++++ services/crawler/tests/test_index_router.py | 134 ++++ .../crawler/tests/test_indexing_service.py | 166 +++++ services/crawler/tests/test_pages_router.py | 204 +++++ services/crawler/tests/test_search_router.py | 129 ++++ services/crawler/tests/test_search_service.py | 174 +++++ services/crawler/tests/test_website_store.py | 701 ------------------ .../crawler/tests/test_websites_router.py | 
254 +++++++ services/crawler/uv.lock | 52 ++ services/db/Dockerfile | 36 +- services/db/docker-entrypoint-wrapper.sh | 38 +- .../db/init-scripts/01-init-extensions.sql | 24 + .../db/init-scripts/01-init-timescaledb.sql | 85 --- .../02-create-convex-database.sql | 21 +- .../init-scripts/03-create-rag-database.sql | 58 +- .../04-create-search-database.sql | 111 +++ services/db/postgresql.conf | 22 +- .../features/automations/utils/step-icons.tsx | 2 - .../components/website-pages-dialog.tsx | 154 ++-- .../websites/components/websites-table.tsx | 16 +- .../app/features/websites/hooks/mutations.ts | 17 +- .../app/features/websites/hooks/queries.ts | 17 - services/platform/convex/_generated/api.d.ts | 125 +--- .../database/helpers/schema_definitions.ts | 13 - .../web/helpers/query_web_context.ts | 42 +- .../agent_tools/web/helpers/search_pages.ts | 46 +- services/platform/convex/convex.config.ts | 2 - .../convex/lib/embedding_config.test.ts | 162 ---- .../platform/convex/lib/embedding_config.ts | 104 --- .../convex/lib/rls/helpers/rls_rules.ts | 12 - .../convex/predefined_workflows/index.ts | 2 - .../predefined_workflows/website_scan.ts | 190 ----- services/platform/convex/schema.ts | 19 +- .../chunk_content.test.ts | 92 --- .../website_page_embeddings/chunk_content.ts | 128 ---- .../content_hash.test.ts | 38 - .../website_page_embeddings/content_hash.ts | 14 - .../website_page_embeddings/embedding_pool.ts | 18 - .../internal_actions.ts | 414 ----------- .../internal_mutations.ts | 194 ----- .../internal_queries.ts | 189 ----- .../website_page_embeddings/rrf.test.ts | 69 -- .../convex/website_page_embeddings/rrf.ts | 40 - .../convex/website_page_embeddings/schema.ts | 46 -- services/platform/convex/websites/actions.ts | 217 ++++++ .../convex/websites/bulk_create_websites.ts | 6 +- .../convex/websites/bulk_upsert_pages.ts | 162 ---- .../convex/websites/cleanup_website.test.ts | 138 ---- .../convex/websites/cleanup_website.ts | 50 -- .../convex/websites/create_website.ts 
| 79 +- .../convex/websites/delete_website.ts | 46 +- .../convex/websites/get_page_by_url.ts | 30 - .../convex/websites/get_pages_by_website.ts | 36 - .../convex/websites/get_website_by_domain.ts | 4 +- services/platform/convex/websites/helpers.ts | 6 - .../convex/websites/internal_actions.ts | 180 ++++- .../convex/websites/internal_mutations.ts | 78 +- .../convex/websites/internal_queries.ts | 41 +- .../list_website_pages_paginated.test.ts | 91 --- .../websites/list_website_pages_paginated.ts | 28 - .../platform/convex/websites/mutations.ts | 82 +- .../provision_website_scan_workflow.ts | 205 ----- services/platform/convex/websites/queries.ts | 11 - .../platform/convex/websites/register_urls.ts | 119 --- .../convex/websites/rescan_website.ts | 55 +- services/platform/convex/websites/schema.ts | 26 +- services/platform/convex/websites/types.ts | 64 +- .../convex/websites/update_website.ts | 6 +- .../platform/convex/websites/validators.ts | 16 - .../action_defs/action_registry.ts | 2 - .../action_defs/website/helpers/types.ts | 4 +- .../action_defs/website/website_action.ts | 7 +- .../website_pages/helpers/types.ts | 47 -- .../website_pages/website_pages_action.ts | 339 --------- .../helpers/types.ts | 3 +- .../workflow_processing_records_action.ts | 1 - .../validation/variables/action_schemas.ts | 56 +- .../workflow_syntax_compact.ts | 9 - .../processing_records/get_table_indexes.ts | 17 - .../processing_records/internal_mutations.ts | 1 - .../workflows/processing_records/types.ts | 3 +- .../platform/lib/shared/schemas/websites.ts | 19 +- services/platform/messages/en.json | 9 +- 103 files changed, 3934 insertions(+), 5175 deletions(-) create mode 100644 services/crawler/app/routers/index.py create mode 100644 services/crawler/app/routers/pages.py create mode 100644 services/crawler/app/routers/search.py create mode 100644 services/crawler/app/services/chunking_service.py create mode 100644 services/crawler/app/services/database.py create mode 100644 
services/crawler/app/services/embedding_service.py create mode 100644 services/crawler/app/services/indexing_service.py create mode 100644 services/crawler/app/services/pg_website_store.py create mode 100644 services/crawler/app/services/search_service.py delete mode 100644 services/crawler/app/services/website_store.py create mode 100644 services/crawler/tests/test_chunking_service.py create mode 100644 services/crawler/tests/test_embedding_service.py create mode 100644 services/crawler/tests/test_index_router.py create mode 100644 services/crawler/tests/test_indexing_service.py create mode 100644 services/crawler/tests/test_pages_router.py create mode 100644 services/crawler/tests/test_search_router.py create mode 100644 services/crawler/tests/test_search_service.py delete mode 100644 services/crawler/tests/test_website_store.py create mode 100644 services/crawler/tests/test_websites_router.py create mode 100644 services/db/init-scripts/01-init-extensions.sql delete mode 100644 services/db/init-scripts/01-init-timescaledb.sql create mode 100644 services/db/init-scripts/04-create-search-database.sql delete mode 100644 services/platform/convex/lib/embedding_config.test.ts delete mode 100644 services/platform/convex/lib/embedding_config.ts delete mode 100644 services/platform/convex/predefined_workflows/website_scan.ts delete mode 100644 services/platform/convex/website_page_embeddings/chunk_content.test.ts delete mode 100644 services/platform/convex/website_page_embeddings/chunk_content.ts delete mode 100644 services/platform/convex/website_page_embeddings/content_hash.test.ts delete mode 100644 services/platform/convex/website_page_embeddings/content_hash.ts delete mode 100644 services/platform/convex/website_page_embeddings/embedding_pool.ts delete mode 100644 services/platform/convex/website_page_embeddings/internal_actions.ts delete mode 100644 services/platform/convex/website_page_embeddings/internal_mutations.ts delete mode 100644 
services/platform/convex/website_page_embeddings/internal_queries.ts delete mode 100644 services/platform/convex/website_page_embeddings/rrf.test.ts delete mode 100644 services/platform/convex/website_page_embeddings/rrf.ts delete mode 100644 services/platform/convex/website_page_embeddings/schema.ts create mode 100644 services/platform/convex/websites/actions.ts delete mode 100644 services/platform/convex/websites/bulk_upsert_pages.ts delete mode 100644 services/platform/convex/websites/cleanup_website.test.ts delete mode 100644 services/platform/convex/websites/cleanup_website.ts delete mode 100644 services/platform/convex/websites/get_page_by_url.ts delete mode 100644 services/platform/convex/websites/get_pages_by_website.ts delete mode 100644 services/platform/convex/websites/list_website_pages_paginated.test.ts delete mode 100644 services/platform/convex/websites/list_website_pages_paginated.ts delete mode 100644 services/platform/convex/websites/provision_website_scan_workflow.ts delete mode 100644 services/platform/convex/websites/register_urls.ts delete mode 100644 services/platform/convex/workflow_engine/action_defs/website_pages/helpers/types.ts delete mode 100644 services/platform/convex/workflow_engine/action_defs/website_pages/website_pages_action.ts diff --git a/compose.yml b/compose.yml index 27004d355e..9061467772 100644 --- a/compose.yml +++ b/compose.yml @@ -26,7 +26,7 @@ services: # ============================================================================ - # Tale DB (TimescaleDB) + # Tale DB (ParadeDB — pg_search + pgvector) # ============================================================================ db: # Image from GHCR (used when PULL_POLICY=always) @@ -148,6 +148,10 @@ services: # cpus: '1' # memory: 2G + # Dependencies + depends_on: + - db + # Volume mounts # Persist crawler data (website registry + per-site URL databases) volumes: diff --git a/services/crawler/app/config.py b/services/crawler/app/config.py index 269ce48654..b7a77b6499 
100644 --- a/services/crawler/app/config.py +++ b/services/crawler/app/config.py @@ -43,6 +43,13 @@ class Settings(BaseSettings): # Concurrency for Vision processing vision_max_concurrent_pages: int = 3 + # Database configuration + database_url: str | None = None + + # Embedding model configuration + openai_embedding_model: str | None = None + embedding_dimensions: int = 1536 + model_config = SettingsConfigDict( env_prefix="CRAWLER_", env_file=".env", @@ -76,6 +83,15 @@ def get_fast_model(self) -> str: raise ValueError("OPENAI_FAST_MODEL must be set in environment.") return model + def get_embedding_model(self) -> str: + """Get embedding model from CRAWLER_OPENAI_EMBEDDING_MODEL or OPENAI_EMBEDDING_MODEL.""" + model = get_first_model(self.openai_embedding_model) or get_first_model( + os.environ.get("OPENAI_EMBEDDING_MODEL") + ) + if not model: + raise ValueError("OPENAI_EMBEDDING_MODEL must be set in environment.") + return model + # Global settings instance settings = Settings() diff --git a/services/crawler/app/main.py b/services/crawler/app/main.py index e1c33e0b87..221b7c40c3 100644 --- a/services/crawler/app/main.py +++ b/services/crawler/app/main.py @@ -3,7 +3,7 @@ Independent web crawling service using Crawl4AI. Provides REST API for website crawling, URL discovery, document conversion, -template generation, and file parsing. +template generation, file parsing, content indexing, and hybrid search. 
This module follows Clean Architecture principles: - main.py: Application setup, configuration, and router registration @@ -27,16 +27,17 @@ crawler_router, docx_router, image_router, + index_router, + pages_router, pdf_router, pptx_router, + search_router, web_router, websites_router, ) from app.services.crawler_service import get_crawler_service from app.services.image_service import get_image_service from app.services.pdf_service import get_pdf_service -from app.services.scheduler import run_scheduler -from app.services.website_store import get_website_store_manager @asynccontextmanager @@ -54,11 +55,36 @@ async def lifespan(app: FastAPI) -> AsyncGenerator: logger.info("Crawler service initialized successfully") except Exception: logger.exception("Failed to initialize crawler service") - # Don't fail startup - allow lazy initialization + + # Initialize PostgreSQL connection pool + search services + from app.services.database import close_pool, init_pool + from app.services.embedding_service import get_embedding_service + from app.services.indexing_service import IndexingService + from app.services.pg_website_store import PgWebsiteStoreManager + from app.services.scheduler import run_scheduler + from app.services.search_service import SearchService + + pool = await init_pool() + pg_store_manager = PgWebsiteStoreManager(pool) + embedding_service = get_embedding_service() + indexing_service = IndexingService(pool, embedding_service) + search_service = SearchService(pool, embedding_service) + + # Wire services into routers + from app.routers.index import set_indexing_service + from app.routers.search import set_search_service + + set_search_service(search_service) + set_indexing_service(indexing_service) + + # Store references for scheduler and other routers + app.state.pg_store_manager = pg_store_manager + app.state.indexing_service = indexing_service + + logger.info("PostgreSQL pool + search services initialized") # Start background scheduler - store_manager = 
get_website_store_manager() - scheduler_task = asyncio.create_task(run_scheduler(store_manager, get_crawler_service())) + scheduler_task = asyncio.create_task(run_scheduler(pg_store_manager, get_crawler_service(), indexing_service)) logger.info("Background scheduler started") yield @@ -66,16 +92,14 @@ async def lifespan(app: FastAPI) -> AsyncGenerator: # Shutdown logger.info("Shutting down Tale Crawler service...") - # Stop scheduler scheduler_task.cancel() with suppress(asyncio.CancelledError): await scheduler_task logger.info("Scheduler stopped") - # Close all website stores - store_manager.close_all() + await pg_store_manager.close() + await close_pool() - # Cleanup crawler service try: crawler = get_crawler_service() if crawler.initialized: @@ -109,6 +133,9 @@ async def lifespan(app: FastAPI) -> AsyncGenerator: # Register routers app.include_router(crawler_router) app.include_router(websites_router) +app.include_router(search_router) +app.include_router(pages_router) +app.include_router(index_router) app.include_router(pdf_router) app.include_router(image_router) app.include_router(docx_router) diff --git a/services/crawler/app/models.py b/services/crawler/app/models.py index d12af2f5d2..bb9f7cad50 100644 --- a/services/crawler/app/models.py +++ b/services/crawler/app/models.py @@ -28,6 +28,21 @@ class RegisterWebsiteRequest(BaseModel): scan_interval: int = Field(21600, description="Scan interval in seconds (default: 6h)", ge=60) +class WebsiteInfoResponse(BaseModel): + """Full website information.""" + + domain: str + title: str | None = None + description: str | None = None + page_count: int = 0 + status: str = "idle" + scan_interval: int = 21600 + last_scanned_at: str | None = None + error: str | None = None + created_at: str | None = None + updated_at: str | None = None + + class WebsiteUrl(BaseModel): """A tracked URL with content hash.""" @@ -284,3 +299,91 @@ class WebFetchExtractResponse(BaseModel): page_count: int = Field(..., description="Number of 
pages in PDF") vision_used: bool = Field(False, description="Whether Vision API was used for extraction") error: str | None = Field(None, description="Error message if operation failed") + + +# ==================== Search Models ==================== + + +class SearchRequest(BaseModel): + """Request for hybrid search.""" + + query: str = Field(..., description="Search query") + limit: int = Field(10, ge=1, le=100, description="Maximum results") + + +class SearchResultItem(BaseModel): + """A single search result.""" + + url: str + title: str | None = None + chunk_content: str + chunk_index: int + score: float + + +class SearchResponse(BaseModel): + """Response from search endpoint.""" + + query: str + results: list[SearchResultItem] = Field(default_factory=list) + total: int + + +# ==================== Pages List Models ==================== + + +class PageListItem(BaseModel): + """A page in the pages list.""" + + url: str + title: str | None = None + word_count: int = 0 + status: str = "discovered" + content_hash: str | None = None + last_crawled_at: str | None = None + discovered_at: str | None = None + chunks_count: int = 0 + indexed: bool = False + + +class PageListResponse(BaseModel): + """Paginated response of pages for a website.""" + + domain: str + pages: list[PageListItem] = Field(default_factory=list) + total: int = 0 + offset: int = 0 + has_more: bool = False + + +# ==================== Indexing Models ==================== + + +class IndexPageRequest(BaseModel): + """Request to index a single page.""" + + domain: str = Field(..., description="Website domain") + url: str = Field(..., description="Page URL") + title: str | None = Field(None, description="Page title") + content: str = Field(..., description="Page content to index") + + +class IndexPageResponse(BaseModel): + """Response from indexing a single page.""" + + success: bool + url: str + chunks_indexed: int + status: str + error: str | None = None + + +class IndexWebsiteResponse(BaseModel): + 
"""Response from indexing all pages for a website.""" + + success: bool + domain: str + pages_indexed: int + pages_skipped: int + pages_failed: int + total_chunks: int diff --git a/services/crawler/app/routers/__init__.py b/services/crawler/app/routers/__init__.py index 32a17be9a0..4cbfb3ef71 100644 --- a/services/crawler/app/routers/__init__.py +++ b/services/crawler/app/routers/__init__.py @@ -4,6 +4,9 @@ This package contains modular routers following Clean Architecture principles: - crawler: Content fetching and URL check endpoints (/api/v1/urls) - websites: Website registration and URL listing (/api/v1/websites) +- search: Hybrid full-text + vector search (/api/v1/search) +- pages: List indexed pages per website (/api/v1/pages) +- index: Content indexing management (/api/v1/index) - pdf: PDF conversion and parsing (/api/v1/pdf) - image: Image conversion (/api/v1/images) - docx: DOCX document generation and parsing (/api/v1/docx) @@ -14,8 +17,11 @@ from app.routers.crawler import router as crawler_router from app.routers.docx import router as docx_router from app.routers.image import router as image_router +from app.routers.index import router as index_router +from app.routers.pages import router as pages_router from app.routers.pdf import router as pdf_router from app.routers.pptx import router as pptx_router +from app.routers.search import router as search_router from app.routers.web import router as web_router from app.routers.websites import router as websites_router @@ -23,8 +29,11 @@ "crawler_router", "docx_router", "image_router", + "index_router", + "pages_router", "pdf_router", "pptx_router", + "search_router", "web_router", "websites_router", ] diff --git a/services/crawler/app/routers/crawler.py b/services/crawler/app/routers/crawler.py index e9143bd3ae..51c60933cf 100644 --- a/services/crawler/app/routers/crawler.py +++ b/services/crawler/app/routers/crawler.py @@ -4,7 +4,7 @@ from typing import Annotated -from fastapi import APIRouter, 
HTTPException, Query +from fastapi import APIRouter, HTTPException, Query, Request from loguru import logger from pydantic import HttpUrl @@ -14,13 +14,12 @@ PageContent, ) from app.services.crawler_service import get_crawler_service -from app.services.website_store import get_website_store_manager router = APIRouter(prefix="/api/v1/urls", tags=["Crawler"]) @router.post("/fetch", response_model=FetchUrlsResponse) -async def fetch_urls(request: FetchUrlsRequest): +async def fetch_urls(request: FetchUrlsRequest, http_request: Request): """ Fetch content from a list of specific URLs. @@ -28,8 +27,8 @@ async def fetch_urls(request: FetchUrlsRequest): falling back to live crawling for cache misses. """ try: - store_manager = get_website_store_manager() - cached, urls_to_crawl = store_manager.get_cached_pages(request.urls) + store_manager = http_request.app.state.pg_store_manager + cached, urls_to_crawl = await store_manager.get_cached_pages(request.urls) # Filter cached pages by word_count_threshold threshold = request.word_count_threshold diff --git a/services/crawler/app/routers/index.py b/services/crawler/app/routers/index.py new file mode 100644 index 0000000000..022af6dbf1 --- /dev/null +++ b/services/crawler/app/routers/index.py @@ -0,0 +1,70 @@ +""" +Index Router — Content indexing management endpoints. 
+""" + +from fastapi import APIRouter, HTTPException +from loguru import logger + +from app.models import IndexPageRequest, IndexPageResponse, IndexWebsiteResponse +from app.services.indexing_service import IndexingService + +router = APIRouter(prefix="/api/v1/index", tags=["Indexing"]) + +_indexing_service: IndexingService | None = None + + +def set_indexing_service(service: IndexingService) -> None: + global _indexing_service + _indexing_service = service + + +def _get_indexing_service() -> IndexingService: + if _indexing_service is None: + raise HTTPException(status_code=503, detail="Indexing service not initialized") + return _indexing_service + + +@router.post("/page", response_model=IndexPageResponse) +async def index_page(request: IndexPageRequest): + """Index a single page (chunk + embed + store).""" + try: + service = _get_indexing_service() + result = await service.index_page( + domain=request.domain, + url=request.url, + title=request.title, + content=request.content, + ) + return IndexPageResponse( + success=result["status"] in ("indexed", "skipped"), + url=result["url"], + chunks_indexed=result["chunks_indexed"], + status=result["status"], + error=result.get("error"), + ) + except HTTPException: + raise + except Exception: + logger.exception(f"Indexing failed for {request.url}") + raise HTTPException(status_code=500, detail="Indexing failed") from None + + +@router.post("/website/{domain}", response_model=IndexWebsiteResponse) +async def index_website(domain: str): + """Re-index all pages for a website.""" + try: + service = _get_indexing_service() + result = await service.index_website(domain) + return IndexWebsiteResponse( + success=True, + domain=result["domain"], + pages_indexed=result["pages_indexed"], + pages_skipped=result["pages_skipped"], + pages_failed=result["pages_failed"], + total_chunks=result["total_chunks"], + ) + except HTTPException: + raise + except Exception: + logger.exception(f"Website indexing failed for {domain}") + raise 
HTTPException(status_code=500, detail="Website indexing failed") from None diff --git a/services/crawler/app/routers/pages.py b/services/crawler/app/routers/pages.py new file mode 100644 index 0000000000..c12df87461 --- /dev/null +++ b/services/crawler/app/routers/pages.py @@ -0,0 +1,93 @@ +""" +Pages Router — List indexed pages for a website. +""" + +from fastapi import APIRouter, HTTPException, Query +from loguru import logger + +from app.models import PageListItem, PageListResponse +from app.services.database import get_pool + +router = APIRouter(prefix="/api/v1/pages", tags=["Pages"]) + + +@router.get("/{domain}", response_model=PageListResponse) +async def list_pages( + domain: str, + offset: int = Query(0, ge=0), + limit: int = Query(100, ge=1, le=1000), + status: str | None = Query(None, description="Filter by status (discovered, active, deleted, failed)"), + sort: str = Query("last_crawled_at", description="Sort field (last_crawled_at, discovered_at, word_count)"), +): + """List all crawled pages for a website with indexing status.""" + try: + pool = get_pool() + + valid_sorts = {"last_crawled_at", "discovered_at", "word_count"} + sort_field = sort if sort in valid_sorts else "last_crawled_at" + order = "DESC" + + async with pool.acquire() as conn: + # Build query with optional status filter + conditions = ["wu.domain = $1", "wu.content_hash IS NOT NULL"] + params: list = [domain] + param_idx = 2 + + if status: + conditions.append(f"wu.status = ${param_idx}") + params.append(status) + param_idx += 1 + + where_clause = " AND ".join(conditions) + + # Main query with chunk count via LEFT JOIN + params.extend([limit, offset]) + rows = await conn.fetch( + f"""SELECT wu.url, wu.title, wu.word_count, wu.status, wu.content_hash, + wu.last_crawled_at, wu.discovered_at, + COALESCE(c.chunks_count, 0) AS chunks_count + FROM website_urls wu + LEFT JOIN ( + SELECT url, COUNT(*) AS chunks_count + FROM chunks + GROUP BY url + ) c ON c.url = wu.url + WHERE {where_clause} + 
ORDER BY wu.{sort_field} {order} NULLS LAST + LIMIT ${param_idx} OFFSET ${param_idx + 1}""", + *params, + ) + + # Total count + total = await conn.fetchval( + f"SELECT COUNT(*) FROM website_urls wu WHERE {where_clause}", + *params[: param_idx - 1], + ) + + pages = [ + PageListItem( + url=r["url"], + title=r["title"], + word_count=r["word_count"] or 0, + status=r["status"], + content_hash=r["content_hash"], + last_crawled_at=r["last_crawled_at"].isoformat() if r["last_crawled_at"] else None, + discovered_at=r["discovered_at"].isoformat() if r["discovered_at"] else None, + chunks_count=r["chunks_count"], + indexed=r["chunks_count"] > 0, + ) + for r in rows + ] + + return PageListResponse( + domain=domain, + pages=pages, + total=total, + offset=offset, + has_more=offset + limit < total, + ) + except HTTPException: + raise + except Exception: + logger.exception(f"Error listing pages for {domain}") + raise HTTPException(status_code=500, detail="Failed to list pages") from None diff --git a/services/crawler/app/routers/search.py b/services/crawler/app/routers/search.py new file mode 100644 index 0000000000..3de50934d3 --- /dev/null +++ b/services/crawler/app/routers/search.py @@ -0,0 +1,78 @@ +""" +Search Router — Hybrid full-text + vector search across indexed website content. 
+""" + +from fastapi import APIRouter, HTTPException +from loguru import logger + +from app.models import SearchRequest, SearchResponse, SearchResultItem +from app.services.search_service import SearchService + +router = APIRouter(prefix="/api/v1/search", tags=["Search"]) + +_search_service: SearchService | None = None + + +def set_search_service(service: SearchService) -> None: + global _search_service + _search_service = service + + +def _get_search_service() -> SearchService: + if _search_service is None: + raise HTTPException(status_code=503, detail="Search service not initialized") + return _search_service + + +@router.post("", response_model=SearchResponse) +async def search_all(request: SearchRequest): + """Search across all indexed website content.""" + try: + service = _get_search_service() + results = await service.search(query=request.query, limit=request.limit) + return SearchResponse( + query=request.query, + results=[ + SearchResultItem( + url=r.url, + title=r.title, + chunk_content=r.chunk_content, + chunk_index=r.chunk_index, + score=r.score, + ) + for r in results + ], + total=len(results), + ) + except HTTPException: + raise + except Exception: + logger.exception("Search failed") + raise HTTPException(status_code=500, detail="Search failed") from None + + +@router.post("/{domain}", response_model=SearchResponse) +async def search_domain(domain: str, request: SearchRequest): + """Search within a specific website's indexed content.""" + try: + service = _get_search_service() + results = await service.search(query=request.query, domain=domain, limit=request.limit) + return SearchResponse( + query=request.query, + results=[ + SearchResultItem( + url=r.url, + title=r.title, + chunk_content=r.chunk_content, + chunk_index=r.chunk_index, + score=r.score, + ) + for r in results + ], + total=len(results), + ) + except HTTPException: + raise + except Exception: + logger.exception(f"Search failed for domain {domain}") + raise HTTPException(status_code=500, 
detail="Search failed") from None diff --git a/services/crawler/app/routers/websites.py b/services/crawler/app/routers/websites.py index f31d762967..7db7a1814a 100644 --- a/services/crawler/app/routers/websites.py +++ b/services/crawler/app/routers/websites.py @@ -2,21 +2,37 @@ Websites Router — Website registration and URL listing endpoints. """ -from fastapi import APIRouter, HTTPException, Query +from datetime import datetime + +from fastapi import APIRouter, HTTPException, Query, Request from loguru import logger -from app.models import RegisterWebsiteRequest, WebsiteUrl, WebsiteUrlsResponse +from app.models import RegisterWebsiteRequest, WebsiteInfoResponse, WebsiteUrl, WebsiteUrlsResponse +from app.services.pg_website_store import PgWebsiteStoreManager from app.services.scheduler import trigger_scan -from app.services.website_store import get_website_store_manager router = APIRouter(prefix="/api/v1/websites", tags=["Websites"]) +def _get_manager(request: Request) -> PgWebsiteStoreManager: + return request.app.state.pg_store_manager + + +def _format_timestamp(val) -> str | None: + if val is None: + return None + if isinstance(val, datetime): + return val.isoformat() + if isinstance(val, (int, float)): + return datetime.fromtimestamp(val).isoformat() + return str(val) + + @router.post("") -async def register_website(request: RegisterWebsiteRequest): +async def register_website(request: RegisterWebsiteRequest, http_request: Request): try: - manager = get_website_store_manager() - result = manager.register_website( + manager = _get_manager(http_request) + result = await manager.register_website( domain=request.domain, scan_interval=request.scan_interval, ) @@ -27,11 +43,39 @@ async def register_website(request: RegisterWebsiteRequest): raise HTTPException(status_code=500, detail="Failed to register website") from None +@router.get("/{domain}", response_model=WebsiteInfoResponse) +async def get_website_info(domain: str, http_request: Request): + try: + manager = 
_get_manager(http_request) + website = await manager.get_website(domain) + + if not website: + raise HTTPException(status_code=404, detail=f"Website not found: {domain}") + + return WebsiteInfoResponse( + domain=website["domain"], + title=website.get("title"), + description=website.get("description"), + page_count=website.get("page_count", 0), + status=website.get("status", "idle"), + scan_interval=website.get("scan_interval", 21600), + last_scanned_at=_format_timestamp(website.get("last_scanned_at")), + error=website.get("error"), + created_at=_format_timestamp(website.get("created_at")), + updated_at=_format_timestamp(website.get("updated_at")), + ) + except HTTPException: + raise + except Exception: + logger.exception("Error getting website info") + raise HTTPException(status_code=500, detail="Failed to get website info") from None + + @router.delete("/{domain}") -async def deregister_website(domain: str): +async def deregister_website(domain: str, http_request: Request): try: - manager = get_website_store_manager() - deleted = manager.remove_website(domain) + manager = _get_manager(http_request) + deleted = await manager.remove_website(domain) if not deleted: raise HTTPException(status_code=404, detail=f"Website not found: {domain}") return {"domain": domain, "deleted": True} @@ -45,19 +89,21 @@ async def deregister_website(domain: str): @router.get("/{domain}/urls", response_model=WebsiteUrlsResponse) async def get_website_urls( domain: str, + http_request: Request, offset: int = Query(0, ge=0), limit: int = Query(100, ge=1, le=1000), status: str | None = Query(None), ): try: - manager = get_website_store_manager() - website = manager.get_website(domain) + manager = _get_manager(http_request) + website = await manager.get_website(domain) + if not website: raise HTTPException(status_code=404, detail=f"Website not found: {domain}") site_store = manager.get_site_store(domain) - urls_data = site_store.get_urls_page(offset=offset, limit=limit, status=status) - 
total = site_store.get_total_count(status=status) + urls_data = await site_store.get_urls_page(offset=offset, limit=limit, status=status) + total = await site_store.get_total_count(status=status) urls = [ WebsiteUrl( diff --git a/services/crawler/app/services/chunking_service.py b/services/crawler/app/services/chunking_service.py new file mode 100644 index 0000000000..385324bda5 --- /dev/null +++ b/services/crawler/app/services/chunking_service.py @@ -0,0 +1,159 @@ +""" +Fixed-size content chunking for search indexing. + +Splits text into overlapping chunks of ~512 tokens (~2048 chars), +splitting on paragraph/sentence boundaries where possible. +""" + +import re +from dataclasses import dataclass + +CHUNK_SIZE = 2048 +CHUNK_OVERLAP = 200 +MIN_CHUNK_LENGTH = 50 + + +@dataclass +class ContentChunk: + content: str + index: int + + +def chunk_content( + content: str, + title: str | None = None, + chunk_size: int = CHUNK_SIZE, + chunk_overlap: int = CHUNK_OVERLAP, + min_chunk_length: int = MIN_CHUNK_LENGTH, +) -> list[ContentChunk]: + if not content or not content.strip(): + return [] + + text = content.strip() + prefix = f"{title.strip()}\n\n" if title and title.strip() else "" + + # Split into paragraphs first + paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()] + + chunks: list[ContentChunk] = [] + current = prefix + idx = 0 + + for para in paragraphs: + # If adding this paragraph exceeds chunk_size, finalize current chunk + if current and len(current) + len(para) + 2 > chunk_size: + if len(current.strip()) >= min_chunk_length: + chunks.append(ContentChunk(content=current.strip(), index=idx)) + idx += 1 + + # Start new chunk with overlap from the end of current + overlap_text = current[-chunk_overlap:] if len(current) > chunk_overlap else current + current = prefix + overlap_text.lstrip() + + if len(para) > chunk_size: + # Large paragraph: split by sentences + sentences = _split_sentences(para) + for sentence in sentences: + if len(current) + 
len(sentence) + 1 > chunk_size: + if len(current.strip()) >= min_chunk_length: + chunks.append(ContentChunk(content=current.strip(), index=idx)) + idx += 1 + overlap_text = current[-chunk_overlap:] if len(current) > chunk_overlap else current + current = prefix + overlap_text.lstrip() + + if len(sentence) > chunk_size: + # Very long sentence: hard split + for start in range(0, len(sentence), chunk_size - chunk_overlap): + piece = sentence[start : start + chunk_size] + if len(piece.strip()) >= min_chunk_length: + chunks.append(ContentChunk(content=(prefix + piece).strip(), index=idx)) + idx += 1 + current = prefix + else: + current = current + " " + sentence if current.strip() else prefix + sentence + else: + current = current + "\n\n" + para if current.strip() else prefix + para + + # Flush remaining + if current and len(current.strip()) >= min_chunk_length: + chunks.append(ContentChunk(content=current.strip(), index=idx)) + + return chunks + + +_ABBREVIATIONS = frozenset( + { + "Mr", + "Mrs", + "Ms", + "Dr", + "Jr", + "Sr", + "Prof", + "St", + "vs", + "etc", + "approx", + "Inc", + "Ltd", + "Corp", + "Co", + "Dept", + "Univ", + "Gen", + "Gov", + "Sgt", + "Cpl", + "Pvt", + "Capt", + "Lt", + "Col", + "No", + } +) + +_SENTENCE_SPLIT = re.compile(r'([.!?])\s+(?=[A-Z"])') + + +def _split_sentences(text: str) -> list[str]: + parts = _SENTENCE_SPLIT.split(text) + + # _SENTENCE_SPLIT captures the punctuation as group(1), so the result + # alternates: [text, punct, text, punct, text, ...] + # Reassemble by gluing each punctuation back onto the preceding segment. 
+ raw: list[str] = [] + i = 0 + while i < len(parts): + segment = parts[i] + if i + 1 < len(parts) and parts[i + 1] in ".!?": + segment += parts[i + 1] + i += 2 + else: + i += 1 + stripped = segment.strip() + if stripped: + raw.append(stripped) + + # Rejoin segments that were split after an abbreviation or single capital + sentences: list[str] = [] + for seg in raw: + if sentences and _is_abbreviation_ending(sentences[-1]): + sentences[-1] += " " + seg + else: + sentences.append(seg) + + return sentences + + +def _is_abbreviation_ending(s: str) -> bool: + if not s or s[-1] != ".": + return False + # Single capital letter (e.g. "U." in "U.S.") + if len(s) >= 2 and s[-2].isupper(): + return True + # Known abbreviation: find the last word before the trailing dot + last_dot = s.rfind(".", 0, len(s) - 1) + last_space = s.rfind(" ", 0, len(s) - 1) + start = max(last_dot, last_space) + 1 + word = s[start:-1] + return word in _ABBREVIATIONS diff --git a/services/crawler/app/services/database.py b/services/crawler/app/services/database.py new file mode 100644 index 0000000000..98fe15ff04 --- /dev/null +++ b/services/crawler/app/services/database.py @@ -0,0 +1,57 @@ +""" +Async PostgreSQL connection pool using asyncpg. + +Provides a singleton pool tied to FastAPI's lifespan for the tale_crawler_search database. 
+""" + +import os + +import asyncpg +from loguru import logger + +from app.config import settings + +_pool: asyncpg.Pool | None = None + + +def _get_database_url() -> str: + if settings.database_url: + return settings.database_url + if url := os.environ.get("DATABASE_URL"): + return url + password = os.environ.get("DB_PASSWORD", "tale_password_change_me") + return f"postgresql://tale:{password}@db:5432/tale_crawler_search" + + +async def init_pool() -> asyncpg.Pool: + global _pool + if _pool is not None: + return _pool + + dsn = _get_database_url() + _pool = await asyncpg.create_pool(dsn, min_size=5, max_size=25) + logger.info("PostgreSQL connection pool initialized") + + # Create HNSW index if embeddings exist but index doesn't. + # May fail when embedding column has no dimension yet (empty table). + try: + async with _pool.acquire() as conn: + await conn.execute("SELECT create_chunks_hnsw_index()") + except Exception as e: + logger.warning(f"HNSW index creation deferred: {e}") + + return _pool + + +def get_pool() -> asyncpg.Pool: + if _pool is None: + raise RuntimeError("Database pool not initialized. Call init_pool() first.") + return _pool + + +async def close_pool() -> None: + global _pool + if _pool is not None: + await _pool.close() + _pool = None + logger.info("PostgreSQL connection pool closed") diff --git a/services/crawler/app/services/embedding_service.py b/services/crawler/app/services/embedding_service.py new file mode 100644 index 0000000000..ea1cc60839 --- /dev/null +++ b/services/crawler/app/services/embedding_service.py @@ -0,0 +1,79 @@ +""" +OpenAI-compatible embedding generation service. + +Uses the async OpenAI client to generate embeddings via any OpenAI-compatible API. 
+""" + +import asyncio + +from loguru import logger +from openai import AsyncOpenAI + +from app.config import settings + +MAX_BATCH_SIZE = 2048 +MAX_CONCURRENT_REQUESTS = 3 +RETRY_DELAY_SECONDS = 1.0 + + +class EmbeddingService: + def __init__(self, api_key: str, base_url: str | None, model: str, dimensions: int): + self._client = AsyncOpenAI(api_key=api_key, base_url=base_url) + self._model = model + self._dimensions = dimensions + self._semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS) + + @property + def dimensions(self) -> int: + return self._dimensions + + async def _embed_batch(self, batch: list[str]) -> list[list[float]]: + async with self._semaphore: + try: + response = await self._client.embeddings.create( + model=self._model, + input=batch, + dimensions=self._dimensions, + ) + return [item.embedding for item in response.data] + except Exception: + logger.warning(f"Embedding request failed, retrying in {RETRY_DELAY_SECONDS}s") + await asyncio.sleep(RETRY_DELAY_SECONDS) + response = await self._client.embeddings.create( + model=self._model, + input=batch, + dimensions=self._dimensions, + ) + return [item.embedding for item in response.data] + + async def embed_texts(self, texts: list[str]) -> list[list[float]]: + if not texts: + return [] + + all_embeddings: list[list[float]] = [] + for i in range(0, len(texts), MAX_BATCH_SIZE): + batch = texts[i : i + MAX_BATCH_SIZE] + batch_embeddings = await self._embed_batch(batch) + all_embeddings.extend(batch_embeddings) + + return all_embeddings + + async def embed_query(self, query: str) -> list[float]: + result = await self.embed_texts([query]) + return result[0] + + +_embedding_service: EmbeddingService | None = None + + +def get_embedding_service() -> EmbeddingService: + global _embedding_service + if _embedding_service is None: + _embedding_service = EmbeddingService( + api_key=settings.get_openai_api_key(), + base_url=settings.get_openai_base_url(), + model=settings.get_embedding_model(), + 
dimensions=settings.embedding_dimensions, + ) + logger.info(f"Embedding service: model={settings.get_embedding_model()}, dims={settings.embedding_dimensions}") + return _embedding_service diff --git a/services/crawler/app/services/indexing_service.py b/services/crawler/app/services/indexing_service.py new file mode 100644 index 0000000000..44f9e58164 --- /dev/null +++ b/services/crawler/app/services/indexing_service.py @@ -0,0 +1,133 @@ +""" +Content indexing pipeline: chunk → embed → store in PostgreSQL. +""" + +import asyncio +import hashlib +import logging + +import asyncpg + +from app.services.chunking_service import chunk_content +from app.services.embedding_service import EmbeddingService + +logger = logging.getLogger(__name__) + +INDEXING_CONCURRENCY = 5 + + +def _sha256(content: str) -> str: + return hashlib.sha256(content.encode()).hexdigest() + + +class IndexingService: + def __init__(self, pool: asyncpg.Pool, embedding_service: EmbeddingService): + self._pool = pool + self._embedding = embedding_service + + async def index_page(self, domain: str, url: str, title: str | None, content: str) -> dict: + content_hash = _sha256(content) + + # Check if already indexed with same hash + async with self._pool.acquire() as conn: + existing_hash = await conn.fetchval("SELECT content_hash FROM chunks WHERE url = $1 LIMIT 1", url) + if existing_hash == content_hash: + return {"url": url, "status": "skipped", "chunks_indexed": 0} + + # Chunk content + chunks = chunk_content(content, title=title) + if not chunks: + return {"url": url, "status": "empty", "chunks_indexed": 0} + + # Generate embeddings + texts = [c.content for c in chunks] + try: + embeddings = await self._embedding.embed_texts(texts) + except Exception: + logger.exception(f"Embedding failed for {url}") + return {"url": url, "status": "error", "chunks_indexed": 0, "error": "embedding_failed"} + + # Store in DB (ensure website_urls entry exists, delete old chunks → insert new) + async with 
self._pool.acquire() as conn, conn.transaction(): + await conn.execute( + """INSERT INTO website_urls (domain, url, title, content_hash, status, discovered_at, last_crawled_at) + VALUES ($1, $2, $3, $4, 'active', NOW(), NOW()) + ON CONFLICT (domain, url) DO UPDATE SET + title = COALESCE(EXCLUDED.title, website_urls.title), + content_hash = EXCLUDED.content_hash, + last_crawled_at = NOW()""", + domain, + url, + title, + content_hash, + ) + await conn.execute("DELETE FROM chunks WHERE url = $1", url) + await conn.executemany( + """INSERT INTO chunks (domain, url, title, content_hash, chunk_index, chunk_content, embedding) + VALUES ($1, $2, $3, $4, $5, $6, $7::vector)""", + [ + (domain, url, title, content_hash, chunk.index, chunk.content, str(embeddings[i])) + for i, chunk in enumerate(chunks) + ], + ) + + logger.info(f"Indexed {len(chunks)} chunks for {url}") + return {"url": url, "status": "indexed", "chunks_indexed": len(chunks)} + + async def index_website(self, domain: str) -> dict: + indexed = 0 + skipped = 0 + failed = 0 + total_chunks = 0 + sem = asyncio.Semaphore(INDEXING_CONCURRENCY) + page_size = 100 + offset = 0 + + while True: + async with self._pool.acquire() as conn: + rows = await conn.fetch( + """SELECT url, title, content FROM website_urls + WHERE domain = $1 AND content IS NOT NULL + ORDER BY id + LIMIT $2 OFFSET $3""", + domain, + page_size, + offset, + ) + + if not rows: + break + + async def _index_one(row: asyncpg.Record) -> dict: + async with sem: + return await self.index_page(domain, row["url"], row["title"], row["content"]) + + results = await asyncio.gather(*[_index_one(row) for row in rows], return_exceptions=True) + + for result in results: + if isinstance(result, Exception): + logger.exception(f"Indexing task failed for {domain}: {result}") + failed += 1 + elif result["status"] == "indexed": + indexed += 1 + total_chunks += result["chunks_indexed"] + elif result["status"] == "skipped": + skipped += 1 + else: + failed += 1 + + offset += 
page_size + + return { + "domain": domain, + "pages_indexed": indexed, + "pages_skipped": skipped, + "pages_failed": failed, + "total_chunks": total_chunks, + } + + async def delete_page_chunks(self, url: str) -> int: + async with self._pool.acquire() as conn: + result = await conn.execute("DELETE FROM chunks WHERE url = $1", url) + count = int(result.split()[-1]) if result else 0 + return count diff --git a/services/crawler/app/services/pg_website_store.py b/services/crawler/app/services/pg_website_store.py new file mode 100644 index 0000000000..1d01ecac12 --- /dev/null +++ b/services/crawler/app/services/pg_website_store.py @@ -0,0 +1,340 @@ +""" +Async PostgreSQL-backed website store, replacing the SQLite multi-DB architecture. + +PgWebsiteStore: per-domain URL operations (scoped by domain column). +PgWebsiteStoreManager: website registry + factory for PgWebsiteStore instances. +""" + +import json +import logging +from datetime import UTC, datetime +from urllib.parse import urlparse + +import asyncpg + +logger = logging.getLogger(__name__) + + +class PgWebsiteStore: + """Per-domain URL operations backed by PostgreSQL.""" + + def __init__(self, pool: asyncpg.Pool, domain: str): + self._pool = pool + self._domain = domain + + async def save_discovered_urls(self, urls: list[dict]) -> int: + if not urls: + return 0 + + async with self._pool.acquire() as conn: + await conn.executemany( + """INSERT INTO website_urls (domain, url, discovered_at) + VALUES ($1, $2, NOW()) + ON CONFLICT (domain, url) DO NOTHING""", + [(self._domain, u["url"]) for u in urls], + ) + count = await conn.fetchval("SELECT COUNT(*) FROM website_urls WHERE domain = $1", self._domain) + logger.info(f"Saved discovered URLs for {self._domain}, total: {count}") + return len(urls) + + async def get_urls_page(self, offset: int = 0, limit: int = 100, status: str | None = None) -> list[dict]: + async with self._pool.acquire() as conn: + if status: + rows = await conn.fetch( + """SELECT url, content_hash, 
status, last_crawled_at + FROM website_urls + WHERE domain = $1 AND content_hash IS NOT NULL AND status = $2 + ORDER BY id LIMIT $3 OFFSET $4""", + self._domain, + status, + limit, + offset, + ) + else: + rows = await conn.fetch( + """SELECT url, content_hash, status, last_crawled_at + FROM website_urls + WHERE domain = $1 AND content_hash IS NOT NULL + ORDER BY id LIMIT $2 OFFSET $3""", + self._domain, + limit, + offset, + ) + return [ + { + "url": r["url"], + "content_hash": r["content_hash"], + "status": r["status"], + "last_crawled_at": r["last_crawled_at"].timestamp() if r["last_crawled_at"] else None, + } + for r in rows + ] + + async def get_urls_needing_recrawl(self, limit: int = 20, crawled_before: float | None = None) -> list[str]: + async with self._pool.acquire() as conn: + if crawled_before is not None: + ts = datetime.fromtimestamp(crawled_before, tz=UTC) + rows = await conn.fetch( + """SELECT url FROM website_urls + WHERE domain = $1 AND status != 'deleted' + AND (last_crawled_at IS NULL OR last_crawled_at < $2) + ORDER BY CASE WHEN content_hash IS NULL THEN 0 ELSE 1 END, + last_crawled_at ASC NULLS FIRST + LIMIT $3""", + self._domain, + ts, + limit, + ) + else: + rows = await conn.fetch( + """SELECT url FROM website_urls + WHERE domain = $1 AND status != 'deleted' + ORDER BY CASE WHEN content_hash IS NULL THEN 0 ELSE 1 END, + last_crawled_at ASC NULLS FIRST + LIMIT $2""", + self._domain, + limit, + ) + return [r["url"] for r in rows] + + async def increment_fail_count(self, urls: list[str]) -> None: + if not urls: + return + async with self._pool.acquire() as conn: + await conn.executemany( + """UPDATE website_urls + SET fail_count = fail_count + 1, last_crawled_at = NOW() + WHERE domain = $1 AND url = $2""", + [(self._domain, url) for url in urls], + ) + + async def update_content_hashes(self, updates: list[dict]) -> None: + if not updates: + return + async with self._pool.acquire() as conn: + await conn.executemany( + """UPDATE website_urls + SET 
content_hash = $3, status = $4, last_crawled_at = NOW(), + title = $5, content = $6, word_count = $7, + metadata = $8::jsonb, structured_data = $9::jsonb, + fail_count = 0 + WHERE domain = $1 AND url = $2""", + [ + ( + self._domain, + u["url"], + u["content_hash"], + u.get("status", "active"), + u.get("title"), + u.get("content"), + u.get("word_count"), + u.get("metadata"), + u.get("structured_data"), + ) + for u in updates + ], + ) + + async def mark_urls_deleted(self, urls: list[str]) -> None: + if not urls: + return + async with self._pool.acquire() as conn: + await conn.executemany( + "UPDATE website_urls SET status = 'deleted' WHERE domain = $1 AND url = $2", + [(self._domain, url) for url in urls], + ) + + async def get_cache_headers(self, urls: list[str]) -> dict[str, dict]: + if not urls: + return {} + async with self._pool.acquire() as conn: + rows = await conn.fetch( + """SELECT url, etag, last_modified FROM website_urls + WHERE domain = $1 AND url = ANY($2) + AND (etag IS NOT NULL OR last_modified IS NOT NULL)""", + self._domain, + urls, + ) + return {r["url"]: {"etag": r["etag"], "last_modified": r["last_modified"]} for r in rows} + + async def update_cache_headers(self, updates: list[dict]) -> None: + if not updates: + return + async with self._pool.acquire() as conn: + await conn.executemany( + "UPDATE website_urls SET etag = $3, last_modified = $4 WHERE domain = $1 AND url = $2", + [(self._domain, u["url"], u.get("etag"), u.get("last_modified")) for u in updates], + ) + + async def touch_crawled_at(self, urls: list[str]) -> None: + if not urls: + return + async with self._pool.acquire() as conn: + await conn.executemany( + "UPDATE website_urls SET last_crawled_at = NOW() WHERE domain = $1 AND url = $2", + [(self._domain, url) for url in urls], + ) + + async def get_total_count(self, status: str | None = None) -> int: + async with self._pool.acquire() as conn: + if status: + return await conn.fetchval( + """SELECT COUNT(*) FROM website_urls + WHERE 
domain = $1 AND content_hash IS NOT NULL AND status = $2""", + self._domain, + status, + ) + return await conn.fetchval( + "SELECT COUNT(*) FROM website_urls WHERE domain = $1 AND content_hash IS NOT NULL", + self._domain, + ) + + async def get_cached_pages(self, urls: list[str]) -> list[dict]: + if not urls: + return [] + async with self._pool.acquire() as conn: + rows = await conn.fetch( + """SELECT url, title, content, word_count, metadata, structured_data + FROM website_urls + WHERE domain = $1 AND url = ANY($2) AND content IS NOT NULL""", + self._domain, + urls, + ) + return [ + { + "url": r["url"], + "title": r["title"], + "content": r["content"], + "word_count": r["word_count"] or 0, + "metadata": json.loads(r["metadata"]) if r["metadata"] else None, + "structured_data": json.loads(r["structured_data"]) if r["structured_data"] else None, + } + for r in rows + ] + + +class PgWebsiteStoreManager: + """Website registry + factory for PgWebsiteStore instances.""" + + def __init__(self, pool: asyncpg.Pool): + self._pool = pool + self._stores: dict[str, PgWebsiteStore] = {} + + async def register_website(self, domain: str, scan_interval: int = 21600) -> dict: + async with self._pool.acquire() as conn: + await conn.execute( + """INSERT INTO websites (domain, scan_interval, created_at, updated_at) + VALUES ($1, $2, NOW(), NOW()) + ON CONFLICT(domain) DO UPDATE SET + scan_interval = EXCLUDED.scan_interval, + updated_at = NOW()""", + domain, + scan_interval, + ) + logger.info(f"Registered website: {domain} (interval={scan_interval}s)") + return {"domain": domain, "scan_interval": scan_interval, "status": "idle"} + + async def update_website_metadata( + self, + domain: str, + title: str | None = None, + description: str | None = None, + page_count: int | None = None, + ) -> None: + async with self._pool.acquire() as conn: + await conn.execute( + """UPDATE websites SET + title = COALESCE($2, title), + description = COALESCE($3, description), + page_count = COALESCE($4, 
page_count), + updated_at = NOW() + WHERE domain = $1""", + domain, + title, + description, + page_count, + ) + + async def remove_website(self, domain: str) -> bool: + self._stores.pop(domain, None) + async with self._pool.acquire() as conn: + async with conn.transaction(): + await conn.execute("DELETE FROM chunks WHERE domain = $1", domain) + await conn.execute("DELETE FROM website_urls WHERE domain = $1", domain) + result = await conn.execute("DELETE FROM websites WHERE domain = $1", domain) + deleted = result == "DELETE 1" + if deleted: + logger.info(f"Removed website: {domain}") + return deleted + + async def get_due_websites(self) -> list[dict]: + async with self._pool.acquire() as conn: + rows = await conn.fetch( + """SELECT domain, status, scan_interval, last_scanned_at, error + FROM websites + WHERE status != 'scanning' + AND (last_scanned_at IS NULL + OR last_scanned_at + make_interval(secs => scan_interval) < NOW())""" + ) + return [dict(r) for r in rows] + + async def update_scan_status(self, domain: str, status: str, error: str | None = None) -> None: + async with self._pool.acquire() as conn: + await conn.execute( + "UPDATE websites SET status = $2, error = $3, updated_at = NOW() WHERE domain = $1", + domain, + status, + error, + ) + + async def update_last_scanned(self, domain: str) -> None: + async with self._pool.acquire() as conn: + await conn.execute( + "UPDATE websites SET last_scanned_at = NOW(), updated_at = NOW() WHERE domain = $1", + domain, + ) + + async def get_website(self, domain: str) -> dict | None: + async with self._pool.acquire() as conn: + row = await conn.fetchrow( + """SELECT domain, title, description, page_count, status, scan_interval, + last_scanned_at, error, created_at, updated_at + FROM websites WHERE domain = $1""", + domain, + ) + return dict(row) if row else None + + def get_site_store(self, domain: str) -> PgWebsiteStore: + if domain not in self._stores: + self._stores[domain] = PgWebsiteStore(self._pool, domain) + 
return self._stores[domain] + + async def get_cached_pages(self, urls: list[str]) -> tuple[list[dict], list[str]]: + if not urls: + return [], [] + + by_domain: dict[str, list[str]] = {} + for url in urls: + domain = urlparse(url).netloc + by_domain.setdefault(domain, []).append(url) + + cached: list[dict] = [] + to_crawl: list[str] = [] + + for domain, domain_urls in by_domain.items(): + website = await self.get_website(domain) + if not website: + to_crawl.extend(domain_urls) + continue + + site_store = self.get_site_store(domain) + hits = await site_store.get_cached_pages(domain_urls) + hit_urls = {p["url"] for p in hits} + cached.extend(hits) + to_crawl.extend(u for u in domain_urls if u not in hit_urls) + + return cached, to_crawl + + async def close(self) -> None: + self._stores.clear() + logger.info("PgWebsiteStoreManager closed") diff --git a/services/crawler/app/services/scheduler.py b/services/crawler/app/services/scheduler.py index f95b98d26f..9991c69ecb 100644 --- a/services/crawler/app/services/scheduler.py +++ b/services/crawler/app/services/scheduler.py @@ -2,8 +2,7 @@ Background scheduler for autonomous website scanning. Periodically checks for websites due for scanning and runs discovery + content -hashing in parallel (bounded by Semaphore). Each website writes to its own -SQLite file so there is zero lock contention between concurrent scans. +hashing in parallel (bounded by Semaphore). 
""" import asyncio @@ -15,7 +14,8 @@ import httpx from app.services.crawler_service import CrawlerService -from app.services.website_store import WebsiteStore, WebsiteStoreManager +from app.services.indexing_service import IndexingService +from app.services.pg_website_store import PgWebsiteStore, PgWebsiteStoreManager logger = logging.getLogger(__name__) @@ -24,6 +24,7 @@ POLL_INTERVAL = 60 # seconds _HEAD_TIMEOUT = 10 _HEAD_CONCURRENCY = 5 +_HEAD_BATCH_SIZE = 50 _scan_trigger: asyncio.Event | None = None @@ -39,8 +40,9 @@ def trigger_scan(): async def run_scheduler( - store_manager: WebsiteStoreManager, + store_manager: PgWebsiteStoreManager, crawler_service: CrawlerService, + indexing_service: IndexingService | None = None, ): global _scan_trigger _scan_trigger = asyncio.Event() @@ -49,15 +51,22 @@ async def run_scheduler( async def bounded_scan(domain: str): async with sem: - await _scan_website(domain, store_manager, crawler_service) + await _scan_website(domain, store_manager, crawler_service, indexing_service) while True: try: - due = store_manager.get_due_websites() + due = await store_manager.get_due_websites() if due: logger.info(f"Scheduler: {len(due)} website(s) due for scanning") tasks = [asyncio.create_task(bounded_scan(w["domain"])) for w in due] - await asyncio.gather(*tasks, return_exceptions=True) + results = await asyncio.gather(*tasks, return_exceptions=True) + for website, result in zip(due, results): + if isinstance(result, BaseException): + logger.error(f"Scheduler: scan failed for {website['domain']}: {result}") + try: + await store_manager.update_scan_status(website["domain"], "error", str(result)) + except Exception: + logger.exception(f"Scheduler: failed to update error status for {website['domain']}") except Exception: logger.exception("Scheduler loop error") @@ -71,10 +80,10 @@ async def bounded_scan(domain: str): async def _head_check( urls: list[str], - site_store: WebsiteStore, + site_store: PgWebsiteStore, ) -> tuple[list[str], 
list[str]]: """Split URLs into (unchanged, needs_crawl) using conditional HEAD requests.""" - stored = site_store.get_cache_headers(urls) + stored = await site_store.get_cache_headers(urls) urls_with_headers = [u for u in urls if u in stored] urls_without_headers = [u for u in urls if u not in stored] @@ -117,14 +126,14 @@ async def check_one(client: httpx.AsyncClient, url: str): await asyncio.gather(*[check_one(client, u) for u in urls_with_headers]) if header_updates: - site_store.update_cache_headers(header_updates) + await site_store.update_cache_headers(header_updates) return unchanged, needs_crawl async def _seed_cache_headers( urls: list[str], - site_store: WebsiteStore, + site_store: PgWebsiteStore, ) -> None: """Seed etag/last_modified via HEAD for URLs that just completed their first crawl.""" sem = asyncio.Semaphore(_HEAD_CONCURRENCY) @@ -149,17 +158,59 @@ async def seed_one(client: httpx.AsyncClient, url: str): await asyncio.gather(*[seed_one(client, u) for u in urls]) if header_updates: - site_store.update_cache_headers(header_updates) + await site_store.update_cache_headers(header_updates) logger.info(f"Seeded cache headers for {len(header_updates)}/{len(urls)} URLs") +def _is_homepage(url: str, domain: str) -> bool: + """Check if a URL is the homepage (root path) of the domain.""" + from urllib.parse import urlparse + + parsed = urlparse(url) + return parsed.netloc == domain and parsed.path in ("", "/") + + +def _extract_meta_description(structured_data: dict | None) -> str | None: + """Extract meta description from structured data.""" + if not structured_data: + return None + meta = structured_data.get("meta", {}) + if desc := meta.get("description"): + return desc + og = structured_data.get("opengraph", {}) + if desc := og.get("og:description"): + return desc + return None + + +async def _bulk_head_check( + all_urls: list[str], + site_store: PgWebsiteStore, +) -> tuple[list[str], list[str], set[str]]: + """HEAD check all URLs in batches, return 
(unchanged, needs_crawl, urls_with_prior_headers).""" + all_unchanged: list[str] = [] + all_needs_crawl: list[str] = [] + all_had_headers: set[str] = set() + + for i in range(0, len(all_urls), _HEAD_BATCH_SIZE): + batch = all_urls[i : i + _HEAD_BATCH_SIZE] + had_headers = await site_store.get_cache_headers(batch) + all_had_headers.update(had_headers) + unchanged, needs_crawl = await _head_check(batch, site_store) + all_unchanged.extend(unchanged) + all_needs_crawl.extend(needs_crawl) + + return all_unchanged, all_needs_crawl, all_had_headers + + async def _scan_website( domain: str, - store_manager: WebsiteStoreManager, + store_manager: PgWebsiteStoreManager, crawler_service: CrawlerService, + indexing_service: IndexingService | None = None, ): site_store = store_manager.get_site_store(domain) - store_manager.update_scan_status(domain, "scanning") + await store_manager.update_scan_status(domain, "scanning") try: if not crawler_service.initialized: @@ -168,35 +219,41 @@ async def _scan_website( # Phase 1: Discover new URLs logger.info(f"Scan [{domain}]: Phase 1 — discovering URLs") discovered = await crawler_service.discover_urls(domain=domain, max_urls=-1) - site_store.save_discovered_urls(discovered) + await site_store.save_discovered_urls(discovered) logger.info(f"Scan [{domain}]: discovered {len(discovered)} URLs") - # Phase 2: Crawl URLs in batches and cache content + hashes + # Phase 2: Bulk HEAD check — filter unchanged URLs up front scan_start = time.time() + all_urls = await site_store.get_urls_needing_recrawl(limit=10000, crawled_before=scan_start) + if not all_urls: + logger.info(f"Scan [{domain}]: no URLs need recrawling") + await store_manager.update_last_scanned(domain) + await store_manager.update_scan_status(domain, "active") + return + + logger.info(f"Scan [{domain}]: Phase 2 — HEAD checking {len(all_urls)} URLs in batches of {_HEAD_BATCH_SIZE}") + unchanged, needs_crawl, had_headers = await _bulk_head_check(all_urls, site_store) + + if unchanged: + 
await site_store.touch_crawled_at(unchanged) + logger.info( + f"Scan [{domain}]: HEAD check complete — {len(unchanged)} unchanged, {len(needs_crawl)} need crawling" + ) + + # Phase 3: Crawl changed URLs in batches crawled_total = 0 - skipped_total = 0 - while True: - batch = site_store.get_urls_needing_recrawl(limit=CRAWL_BATCH_SIZE, crawled_before=scan_start) - if not batch: - break - - # Pre-flight: skip URLs unchanged since last crawl (304) - had_headers = site_store.get_cache_headers(batch) - unchanged, to_crawl = await _head_check(batch, site_store) - if unchanged: - site_store.touch_crawled_at(unchanged) - skipped_total += len(unchanged) - - if not to_crawl: - continue + homepage_title: str | None = None + homepage_description: str | None = None + for i in range(0, len(needs_crawl), CRAWL_BATCH_SIZE): + batch = needs_crawl[i : i + CRAWL_BATCH_SIZE] logger.info( - f"Scan [{domain}]: Phase 2 — crawling {len(to_crawl)} URLs " - f"(skipped {len(unchanged)}, total so far: {crawled_total})" + f"Scan [{domain}]: Phase 3 — crawling batch {i // CRAWL_BATCH_SIZE + 1} " + f"({len(batch)} URLs, total so far: {crawled_total})" ) - results = await crawler_service.crawl_urls(urls=to_crawl) + results = await crawler_service.crawl_urls(urls=batch) succeeded_urls = {p["url"] for p in results} - failed_urls = [u for u in to_crawl if u not in succeeded_urls] + failed_urls = [u for u in batch if u not in succeeded_urls] updates = [ { @@ -211,24 +268,55 @@ async def _scan_website( } for p in results ] - site_store.update_content_hashes(updates) + await site_store.update_content_hashes(updates) crawled_total += len(updates) + if homepage_title is None: + for p in results: + if _is_homepage(p["url"], domain): + homepage_title = p.get("title") + sd = p.get("structured_data") + if isinstance(sd, str): + sd = json.loads(sd) + homepage_description = _extract_meta_description(sd) + break + + if indexing_service: + for p in results: + if p.get("content"): + try: + await 
indexing_service.index_page( + domain=domain, + url=p["url"], + title=p.get("title"), + content=p["content"], + ) + except Exception: + logger.exception(f"Indexing failed for {p['url']}") + if failed_urls: logger.warning(f"Scan [{domain}]: {len(failed_urls)} URLs failed in batch") - site_store.increment_fail_count(failed_urls) + await site_store.increment_fail_count(failed_urls) - # Seed cache headers for URLs that had none before first_time = [u for u in succeeded_urls if u not in had_headers] if first_time: await _seed_cache_headers(first_time, site_store) - logger.info(f"Scan [{domain}]: crawled {crawled_total}, skipped {skipped_total} unchanged URLs") + logger.info(f"Scan [{domain}]: crawled {crawled_total}, skipped {len(unchanged)} unchanged URLs") + + # Phase 4: Update website metadata + page_count = await site_store.get_total_count() + await store_manager.update_website_metadata( + domain=domain, + title=homepage_title, + description=homepage_description, + page_count=page_count, + ) - store_manager.update_last_scanned(domain) - store_manager.update_scan_status(domain, "idle") - logger.info(f"Scan [{domain}]: complete") + await store_manager.update_last_scanned(domain) + await store_manager.update_scan_status(domain, "active") + logger.info(f"Scan [{domain}]: complete (pages={page_count})") except Exception as e: logger.exception(f"Scan failed for {domain}") - store_manager.update_scan_status(domain, "error", str(e)) + await store_manager.update_scan_status(domain, "error", str(e)) diff --git a/services/crawler/app/services/search_service.py b/services/crawler/app/services/search_service.py new file mode 100644 index 0000000000..67356e173f --- /dev/null +++ b/services/crawler/app/services/search_service.py @@ -0,0 +1,130 @@ +""" +Hybrid search service: BM25 full-text (pg_search) + pgvector similarity with RRF fusion. 
+""" + +import asyncio +import json +import logging +from dataclasses import dataclass + +import asyncpg + +from app.services.embedding_service import EmbeddingService + +logger = logging.getLogger(__name__) + +RRF_K = 60 + + +@dataclass +class SearchResult: + url: str + title: str | None + chunk_content: str + chunk_index: int + score: float + + +class SearchService: + def __init__(self, pool: asyncpg.Pool, embedding_service: EmbeddingService): + self._pool = pool + self._embedding = embedding_service + + async def search( + self, + query: str, + domain: str | None = None, + limit: int = 10, + ) -> list[SearchResult]: + # Generate query embedding and run both searches in parallel + embedding_task = asyncio.create_task(self._embedding.embed_query(query)) + fts_task = asyncio.create_task(self._fts_search(query, domain, limit * 3)) + + query_embedding = await embedding_task + fts_results = await fts_task + vector_results = await self._vector_search(query_embedding, domain, limit * 3) + + return self._merge_rrf([fts_results, vector_results], limit) + + async def _fts_search(self, query: str, domain: str | None, limit: int) -> list[dict]: + async with self._pool.acquire() as conn: + if domain: + rows = await conn.fetch( + """SELECT id, url, title, chunk_content, chunk_index, + paradedb.score(id) AS score + FROM chunks + WHERE chunk_content @@@ $1 AND domain = $2 + ORDER BY score DESC + LIMIT $3""", + query, + domain, + limit, + ) + else: + rows = await conn.fetch( + """SELECT id, url, title, chunk_content, chunk_index, + paradedb.score(id) AS score + FROM chunks + WHERE chunk_content @@@ $1 + ORDER BY score DESC + LIMIT $2""", + query, + limit, + ) + return [dict(r) for r in rows] + + async def _vector_search(self, embedding: list[float], domain: str | None, limit: int) -> list[dict]: + vec_str = json.dumps(embedding) + async with self._pool.acquire() as conn: + if domain: + rows = await conn.fetch( + """SELECT id, url, title, chunk_content, chunk_index, + 1 - 
(embedding <=> $1::vector) AS score + FROM chunks + WHERE domain = $2 AND embedding IS NOT NULL + ORDER BY embedding <=> $1::vector + LIMIT $3""", + vec_str, + domain, + limit, + ) + else: + rows = await conn.fetch( + """SELECT id, url, title, chunk_content, chunk_index, + 1 - (embedding <=> $1::vector) AS score + FROM chunks + WHERE embedding IS NOT NULL + ORDER BY embedding <=> $1::vector + LIMIT $2""", + vec_str, + limit, + ) + return [dict(r) for r in rows] + + @staticmethod + def _merge_rrf(ranked_lists: list[list[dict]], limit: int) -> list[SearchResult]: + scores: dict[int, float] = {} + items: dict[int, dict] = {} + + for ranked in ranked_lists: + for rank, item in enumerate(ranked): + item_id = item["id"] + rrf_score = 1.0 / (RRF_K + rank + 1) + scores[item_id] = scores.get(item_id, 0.0) + rrf_score + items[item_id] = item + + sorted_ids = sorted(scores, key=lambda k: scores[k], reverse=True)[:limit] + + # Normalize scores + max_score = scores[sorted_ids[0]] if sorted_ids else 1.0 + + return [ + SearchResult( + url=items[item_id]["url"], + title=items[item_id].get("title"), + chunk_content=items[item_id]["chunk_content"], + chunk_index=items[item_id]["chunk_index"], + score=scores[item_id] / max_score, + ) + for item_id in sorted_ids + ] diff --git a/services/crawler/app/services/website_store.py b/services/crawler/app/services/website_store.py deleted file mode 100644 index d090fbb7cd..0000000000 --- a/services/crawler/app/services/website_store.py +++ /dev/null @@ -1,451 +0,0 @@ -""" -Multi-DB SQLite store for website URL registry with content hashing. 
- -Architecture: -- Main DB (data/crawler.db): websites table — registry of all tracked websites -- Per-site DB (data/sites/{domain}.db): website_urls table — URLs + content_hash per site - -Benefits: -- Zero lock contention: each website has its own SQLite file, independent WAL -- Natural concurrency: different websites can be scanned in parallel -- Clean deletion: remove_website = close connection + unlink the .db file -""" - -import json -import logging -import sqlite3 -import time -from pathlib import Path -from urllib.parse import urlparse - -logger = logging.getLogger(__name__) - -_DEFAULT_DATA_DIR = Path(__file__).resolve().parent.parent.parent / "data" - - -def _sanitize_domain(domain: str) -> str: - return domain.replace(".", "_").replace("-", "_") - - -class WebsiteStore: - """Manages one per-site SQLite file with URL registry and content hashes.""" - - def __init__(self, db_path: Path): - self._db_path = db_path - self._db_path.parent.mkdir(parents=True, exist_ok=True) - self._conn: sqlite3.Connection | None = None - self._get_conn() - - def _get_conn(self) -> sqlite3.Connection: - if self._conn is None: - self._conn = sqlite3.connect(str(self._db_path), timeout=30) - self._conn.execute("PRAGMA journal_mode=WAL") - self._conn.execute("PRAGMA busy_timeout=5000") - self._conn.row_factory = sqlite3.Row - self._ensure_schema(self._conn) - return self._conn - - @staticmethod - def _ensure_schema(conn: sqlite3.Connection): - conn.executescript(""" - CREATE TABLE IF NOT EXISTS website_urls ( - url TEXT PRIMARY KEY, - content_hash TEXT, - status TEXT NOT NULL DEFAULT 'discovered', - last_crawled_at REAL, - discovered_at REAL NOT NULL, - title TEXT, - content TEXT, - word_count INTEGER, - metadata TEXT, - structured_data TEXT, - fail_count INTEGER NOT NULL DEFAULT 0 - ); - - CREATE INDEX IF NOT EXISTS idx_crawl_order - ON website_urls(last_crawled_at); - """) - - # Migrate existing databases: add content cache columns if missing - existing = {row[1] for row in 
conn.execute("PRAGMA table_info(website_urls)").fetchall()} - for col, col_type in [ - ("title", "TEXT"), - ("content", "TEXT"), - ("word_count", "INTEGER"), - ("metadata", "TEXT"), - ("structured_data", "TEXT"), - ("fail_count", "INTEGER NOT NULL DEFAULT 0"), - ("etag", "TEXT"), - ("last_modified", "TEXT"), - ]: - if col not in existing: - conn.execute(f"ALTER TABLE website_urls ADD COLUMN {col} {col_type}") - - conn.commit() - - def save_discovered_urls(self, urls: list[dict]) -> int: - if not urls: - return 0 - - now = time.time() - rows = [(u["url"], now) for u in urls] - conn = self._get_conn() - conn.executemany( - "INSERT OR IGNORE INTO website_urls (url, discovered_at) VALUES (?, ?)", - rows, - ) - inserted = conn.total_changes - conn.commit() - return inserted - - def get_urls_page(self, offset: int = 0, limit: int = 100, status: str | None = None) -> list[dict]: - conn = self._get_conn() - if status: - rows = conn.execute( - "SELECT url, content_hash, status, last_crawled_at " - "FROM website_urls WHERE content_hash IS NOT NULL AND status = ? " - "ORDER BY rowid LIMIT ? OFFSET ?", - (status, limit, offset), - ).fetchall() - else: - rows = conn.execute( - "SELECT url, content_hash, status, last_crawled_at " - "FROM website_urls WHERE content_hash IS NOT NULL " - "ORDER BY rowid LIMIT ? OFFSET ?", - (limit, offset), - ).fetchall() - - return [ - { - "url": r["url"], - "content_hash": r["content_hash"], - "status": r["status"], - "last_crawled_at": r["last_crawled_at"], - } - for r in rows - ] - - def get_urls_needing_recrawl(self, limit: int = 20, crawled_before: float | None = None) -> list[str]: - conn = self._get_conn() - if crawled_before is not None: - rows = conn.execute( - "SELECT url FROM website_urls " - "WHERE status != 'deleted' " - "AND (last_crawled_at IS NULL OR last_crawled_at < ?) 
" - "ORDER BY CASE WHEN content_hash IS NULL THEN 0 ELSE 1 END, " - "last_crawled_at ASC NULLS FIRST " - "LIMIT ?", - (crawled_before, limit), - ).fetchall() - else: - rows = conn.execute( - "SELECT url FROM website_urls " - "WHERE status != 'deleted' " - "ORDER BY CASE WHEN content_hash IS NULL THEN 0 ELSE 1 END, " - "last_crawled_at ASC NULLS FIRST " - "LIMIT ?", - (limit,), - ).fetchall() - return [r["url"] for r in rows] - - def increment_fail_count(self, urls: list[str]): - if not urls: - return - - now = time.time() - conn = self._get_conn() - conn.executemany( - "UPDATE website_urls SET fail_count = fail_count + 1, last_crawled_at = ? WHERE url = ?", - [(now, url) for url in urls], - ) - conn.commit() - - def update_content_hashes(self, updates: list[dict]): - if not updates: - return - - now = time.time() - conn = self._get_conn() - conn.executemany( - "UPDATE website_urls " - "SET content_hash = ?, status = ?, last_crawled_at = ?, " - " title = ?, content = ?, word_count = ?, metadata = ?, structured_data = ?, " - " fail_count = 0 " - "WHERE url = ?", - [ - ( - u["content_hash"], - u.get("status", "active"), - now, - u.get("title"), - u.get("content"), - u.get("word_count"), - u.get("metadata"), - u.get("structured_data"), - u["url"], - ) - for u in updates - ], - ) - conn.commit() - - def mark_urls_deleted(self, urls: list[str]): - if not urls: - return - - conn = self._get_conn() - conn.executemany( - "UPDATE website_urls SET status = 'deleted' WHERE url = ?", - [(url,) for url in urls], - ) - conn.commit() - - def get_cache_headers(self, urls: list[str]) -> dict[str, dict]: - """Load stored etag/last_modified for URLs that have at least one header.""" - if not urls: - return {} - - conn = self._get_conn() - placeholders = ",".join("?" 
* len(urls)) - rows = conn.execute( - "SELECT url, etag, last_modified FROM website_urls " - f"WHERE url IN ({placeholders}) AND (etag IS NOT NULL OR last_modified IS NOT NULL)", - urls, - ).fetchall() - - return {r["url"]: {"etag": r["etag"], "last_modified": r["last_modified"]} for r in rows} - - def update_cache_headers(self, updates: list[dict]): - """Batch store etag/last_modified from HEAD responses.""" - if not updates: - return - - conn = self._get_conn() - conn.executemany( - "UPDATE website_urls SET etag = ?, last_modified = ? WHERE url = ?", - [(u.get("etag"), u.get("last_modified"), u["url"]) for u in updates], - ) - conn.commit() - - def touch_crawled_at(self, urls: list[str]): - """Update only last_crawled_at for unchanged URLs (skipped by 304).""" - if not urls: - return - - now = time.time() - conn = self._get_conn() - conn.executemany( - "UPDATE website_urls SET last_crawled_at = ? WHERE url = ?", - [(now, url) for url in urls], - ) - conn.commit() - - def get_total_count(self, status: str | None = None) -> int: - conn = self._get_conn() - if status: - row = conn.execute( - "SELECT COUNT(*) as cnt FROM website_urls WHERE content_hash IS NOT NULL AND status = ?", - (status,), - ).fetchone() - else: - row = conn.execute("SELECT COUNT(*) as cnt FROM website_urls WHERE content_hash IS NOT NULL").fetchone() - return row["cnt"] if row else 0 - - def get_cached_pages(self, urls: list[str]) -> list[dict]: - if not urls: - return [] - - conn = self._get_conn() - placeholders = ",".join("?" 
* len(urls)) - rows = conn.execute( - "SELECT url, title, content, word_count, metadata, structured_data " - f"FROM website_urls WHERE url IN ({placeholders}) AND content IS NOT NULL", - urls, - ).fetchall() - - return [ - { - "url": r["url"], - "title": r["title"], - "content": r["content"], - "word_count": r["word_count"] or 0, - "metadata": json.loads(r["metadata"]) if r["metadata"] else None, - "structured_data": json.loads(r["structured_data"]) if r["structured_data"] else None, - } - for r in rows - ] - - def close(self): - if self._conn: - self._conn.close() - self._conn = None - - -class WebsiteStoreManager: - """Manages main DB (website registry) + per-site WebsiteStore instances.""" - - def __init__(self, data_dir: Path | None = None): - self._data_dir = data_dir or _DEFAULT_DATA_DIR - self._data_dir.mkdir(parents=True, exist_ok=True) - self._main_db_path = self._data_dir / "crawler.db" - self._sites_dir = self._data_dir / "sites" - self._sites_dir.mkdir(parents=True, exist_ok=True) - self._stores: dict[str, WebsiteStore] = {} - self._main_conn: sqlite3.Connection | None = None - self._init_main_db() - - def _get_main_conn(self) -> sqlite3.Connection: - if self._main_conn is None: - self._main_conn = sqlite3.connect(str(self._main_db_path), timeout=30) - self._main_conn.execute("PRAGMA journal_mode=WAL") - self._main_conn.execute("PRAGMA busy_timeout=5000") - self._main_conn.row_factory = sqlite3.Row - return self._main_conn - - def _init_main_db(self): - conn = self._get_main_conn() - conn.executescript(""" - CREATE TABLE IF NOT EXISTS websites ( - domain TEXT PRIMARY KEY, - status TEXT NOT NULL DEFAULT 'idle', - scan_interval INTEGER NOT NULL DEFAULT 21600, - last_scanned_at REAL, - error TEXT, - created_at REAL NOT NULL, - updated_at REAL NOT NULL - ); - """) - conn.commit() - logger.info(f"Website store manager initialized at {self._data_dir}") - - def register_website(self, domain: str, scan_interval: int = 21600) -> dict: - now = time.time() - conn 
= self._get_main_conn() - conn.execute( - """INSERT INTO websites (domain, scan_interval, created_at, updated_at) - VALUES (?, ?, ?, ?) - ON CONFLICT(domain) DO UPDATE SET - scan_interval = excluded.scan_interval, - updated_at = excluded.updated_at""", - (domain, scan_interval, now, now), - ) - conn.commit() - logger.info(f"Registered website: {domain} (interval={scan_interval}s)") - return {"domain": domain, "scan_interval": scan_interval, "status": "idle"} - - def remove_website(self, domain: str) -> bool: - if domain in self._stores: - self._stores[domain].close() - del self._stores[domain] - - db_file = self._sites_dir / f"{_sanitize_domain(domain)}.db" - if db_file.exists(): - db_file.unlink() - wal = db_file.with_suffix(".db-wal") - shm = db_file.with_suffix(".db-shm") - if wal.exists(): - wal.unlink() - if shm.exists(): - shm.unlink() - - conn = self._get_main_conn() - cursor = conn.execute("DELETE FROM websites WHERE domain = ?", (domain,)) - conn.commit() - deleted = cursor.rowcount > 0 - if deleted: - logger.info(f"Removed website: {domain}") - return deleted - - def get_due_websites(self) -> list[dict]: - now = time.time() - conn = self._get_main_conn() - rows = conn.execute( - """SELECT domain, status, scan_interval, last_scanned_at, error - FROM websites - WHERE status != 'scanning' - AND (last_scanned_at IS NULL - OR last_scanned_at + scan_interval < ?)""", - (now,), - ).fetchall() - return [dict(r) for r in rows] - - def update_scan_status(self, domain: str, status: str, error: str | None = None): - now = time.time() - conn = self._get_main_conn() - conn.execute( - "UPDATE websites SET status = ?, error = ?, updated_at = ? WHERE domain = ?", - (status, error, now, domain), - ) - conn.commit() - - def update_last_scanned(self, domain: str): - now = time.time() - conn = self._get_main_conn() - conn.execute( - "UPDATE websites SET last_scanned_at = ?, updated_at = ? 
WHERE domain = ?", - (now, now, domain), - ) - conn.commit() - - def get_website(self, domain: str) -> dict | None: - conn = self._get_main_conn() - row = conn.execute( - "SELECT domain, status, scan_interval, last_scanned_at, error, created_at, updated_at " - "FROM websites WHERE domain = ?", - (domain,), - ).fetchone() - return dict(row) if row else None - - def get_site_store(self, domain: str) -> WebsiteStore: - if domain not in self._stores: - db_path = self._sites_dir / f"{_sanitize_domain(domain)}.db" - self._stores[domain] = WebsiteStore(db_path) - return self._stores[domain] - - def get_cached_pages(self, urls: list[str]) -> tuple[list[dict], list[str]]: - """Return cached page content for URLs with registered websites. - - Returns (cached_pages, urls_needing_crawl). - """ - if not urls: - return [], [] - - by_domain: dict[str, list[str]] = {} - for url in urls: - domain = urlparse(url).netloc - by_domain.setdefault(domain, []).append(url) - - cached: list[dict] = [] - to_crawl: list[str] = [] - - for domain, domain_urls in by_domain.items(): - if not self.get_website(domain): - to_crawl.extend(domain_urls) - continue - - site_store = self.get_site_store(domain) - hits = site_store.get_cached_pages(domain_urls) - hit_urls = {p["url"] for p in hits} - cached.extend(hits) - to_crawl.extend(u for u in domain_urls if u not in hit_urls) - - return cached, to_crawl - - def close_all(self): - for store in self._stores.values(): - store.close() - self._stores.clear() - if self._main_conn: - self._main_conn.close() - self._main_conn = None - logger.info("All website stores closed") - - -_store_manager: WebsiteStoreManager | None = None - - -def get_website_store_manager() -> WebsiteStoreManager: - global _store_manager - if _store_manager is None: - _store_manager = WebsiteStoreManager() - return _store_manager diff --git a/services/crawler/pyproject.toml b/services/crawler/pyproject.toml index 182ee6cbde..3ea23329c5 100644 --- a/services/crawler/pyproject.toml +++ 
b/services/crawler/pyproject.toml @@ -18,6 +18,8 @@ dependencies = [ "python-docx==1.2.0", "pymupdf==1.27.1", "openai>=1.0.0", + "asyncpg>=0.30.0", + "tiktoken>=0.9.0", ] [project.optional-dependencies] diff --git a/services/crawler/tests/test_chunking_service.py b/services/crawler/tests/test_chunking_service.py new file mode 100644 index 0000000000..2de3307ae4 --- /dev/null +++ b/services/crawler/tests/test_chunking_service.py @@ -0,0 +1,251 @@ +from app.services.chunking_service import ( + CHUNK_OVERLAP, + CHUNK_SIZE, + MIN_CHUNK_LENGTH, + ContentChunk, + _split_sentences, + chunk_content, +) + + +class TestChunkContentEmptyInput: + def test_empty_string(self): + assert chunk_content("") == [] + + def test_none_like_empty(self): + assert chunk_content("") == [] + + def test_whitespace_only(self): + assert chunk_content(" \n\n \t ") == [] + + def test_newlines_only(self): + assert chunk_content("\n\n\n") == [] + + +class TestChunkContentSingleChunk: + def test_short_content_returns_one_chunk(self): + text = "Hello world, this is a test of the chunking service module." + result = chunk_content(text) + assert len(result) == 1 + assert result[0].content == text + assert result[0].index == 0 + + def test_content_is_stripped(self): + text = "Hello world, this is a test of the chunking service module." + result = chunk_content(f" {text} \n\n") + assert result[0].content == text + + def test_returns_content_chunk_dataclass(self): + text = "Hello world, this is a test of the chunking service module." + result = chunk_content(text) + assert isinstance(result[0], ContentChunk) + + +class TestChunkContentWithTitle: + BODY = "Some body text here that is long enough to pass the minimum chunk length filter." 
+ + def test_title_prepended_to_single_chunk(self): + result = chunk_content(self.BODY, title="My Title") + assert result[0].content.startswith("My Title\n\n") + assert self.BODY in result[0].content + + def test_none_title_ignored(self): + result = chunk_content(self.BODY, title=None) + assert result[0].content == self.BODY + + def test_empty_title_ignored(self): + result = chunk_content(self.BODY, title="") + assert result[0].content == self.BODY + + def test_whitespace_title_ignored(self): + result = chunk_content(self.BODY, title=" ") + assert result[0].content == self.BODY + + def test_title_is_stripped(self): + result = chunk_content(self.BODY, title=" My Title ") + assert result[0].content.startswith("My Title\n\n") + + def test_title_prepended_to_every_chunk(self): + para = "A" * 100 + content = f"{para}\n\n{para}\n\n{para}" + result = chunk_content(content, title="Title", chunk_size=150, chunk_overlap=20) + for chunk in result: + assert chunk.content.startswith("Title") + + +class TestChunkContentMultipleParagraphs: + def test_two_paragraphs_within_limit_stay_in_one_chunk(self): + p1 = "First paragraph with enough content to be meaningful here." + p2 = "Second paragraph also with enough content to pass filters." + content = f"{p1}\n\n{p2}" + result = chunk_content(content, chunk_size=500) + assert len(result) == 1 + assert p1 in result[0].content + assert p2 in result[0].content + + def test_paragraphs_exceeding_limit_split_into_multiple_chunks(self): + p1 = "A" * 100 + p2 = "B" * 100 + p3 = "C" * 100 + content = f"{p1}\n\n{p2}\n\n{p3}" + result = chunk_content(content, chunk_size=150, chunk_overlap=20) + assert len(result) > 1 + + def test_paragraph_boundaries_preserved(self): + p1 = "First paragraph with enough content to pass the minimum length." + p2 = "Second paragraph also with enough content to pass the filter." 
+ content = f"{p1}\n\n{p2}" + result = chunk_content(content, chunk_size=500) + assert "\n\n" in result[0].content + + +class TestChunkContentOverlap: + def test_chunks_share_overlapping_text(self): + p1 = "A" * 150 + p2 = "B" * 150 + result = chunk_content(f"{p1}\n\n{p2}", chunk_size=200, chunk_overlap=50) + assert len(result) >= 2 + tail_of_first = result[0].content[-50:] + assert tail_of_first in result[1].content + + def test_overlap_smaller_than_content_uses_tail(self): + p1 = "X" * 200 + p2 = "Y" * 200 + result = chunk_content(f"{p1}\n\n{p2}", chunk_size=250, chunk_overlap=30) + assert len(result) >= 2 + overlap_region = result[0].content[-30:] + assert overlap_region in result[1].content + + +class TestChunkContentLargeParagraphSentenceSplitting: + def test_large_paragraph_splits_by_sentence(self): + sentences = [f"This is sentence number {i}." for i in range(50)] + large_para = " ".join(sentences) + result = chunk_content(large_para, chunk_size=200, chunk_overlap=30) + assert len(result) > 1 + for chunk in result: + assert len(chunk.content) <= 200 + 50 + + def test_sentences_distributed_across_chunks(self): + sentences = [f"This is a fairly long sentence number {i} here." 
for i in range(30)] + large_para = " ".join(sentences) + result = chunk_content(large_para, chunk_size=200, chunk_overlap=20) + combined = " ".join(c.content for c in result) + for s in sentences: + assert s in combined + + +class TestChunkContentHardSplit: + def test_very_long_sentence_hard_split(self): + long_sentence = "A" * 5000 + result = chunk_content(long_sentence, chunk_size=500, chunk_overlap=50) + assert len(result) > 1 + for chunk in result: + assert len(chunk.content) <= 500 + 50 + + def test_hard_split_pieces_cover_original(self): + long_sentence = "B" * 3000 + result = chunk_content(long_sentence, chunk_size=500, chunk_overlap=50) + combined = "".join(c.content for c in result) + assert "B" * 500 in combined + + +class TestChunkContentMinChunkLength: + def test_short_content_below_min_filtered_out(self): + result = chunk_content("Hi.", min_chunk_length=100) + assert result == [] + + def test_content_at_min_length_kept(self): + text = "A" * MIN_CHUNK_LENGTH + result = chunk_content(text) + assert len(result) == 1 + + def test_content_just_below_min_length_filtered(self): + text = "A" * (MIN_CHUNK_LENGTH - 1) + result = chunk_content(text) + assert result == [] + + def test_custom_min_chunk_length(self): + result = chunk_content("Short text.", min_chunk_length=5) + assert len(result) == 1 + + def test_custom_high_min_chunk_length_filters(self): + result = chunk_content("Short text.", min_chunk_length=500) + assert result == [] + + +class TestChunkContentCustomParams: + def test_custom_chunk_size(self): + content = "Word " * 200 + result_small = chunk_content(content, chunk_size=100, chunk_overlap=10) + result_large = chunk_content(content, chunk_size=2000, chunk_overlap=10) + assert len(result_small) > len(result_large) + + def test_custom_overlap(self): + p1 = "A" * 200 + p2 = "B" * 200 + content = f"{p1}\n\n{p2}" + result_small_overlap = chunk_content(content, chunk_size=250, chunk_overlap=10) + result_large_overlap = chunk_content(content, 
chunk_size=250, chunk_overlap=100) + assert len(result_small_overlap) >= 2 + assert len(result_large_overlap) >= 2 + tail_10 = result_small_overlap[0].content[-10:] + tail_100 = result_large_overlap[0].content[-100:] + assert tail_10 in result_small_overlap[1].content + assert tail_100 in result_large_overlap[1].content + + def test_defaults_match_constants(self): + assert CHUNK_SIZE == 2048 + assert CHUNK_OVERLAP == 200 + assert MIN_CHUNK_LENGTH == 50 + + +class TestChunkContentIndexNumbering: + def test_single_chunk_has_index_zero(self): + text = "Hello world, this is a test of the chunking service module." + result = chunk_content(text) + assert result[0].index == 0 + + def test_multiple_chunks_have_sequential_indexes(self): + paragraphs = [("P" * 100) for _ in range(10)] + content = "\n\n".join(paragraphs) + result = chunk_content(content, chunk_size=150, chunk_overlap=20) + assert len(result) > 1 + for i, chunk in enumerate(result): + assert chunk.index == i + + def test_indexes_are_contiguous(self): + long_sentence = "X" * 3000 + result = chunk_content(long_sentence, chunk_size=300, chunk_overlap=30) + indexes = [c.index for c in result] + assert indexes == list(range(len(result))) + + +class TestSplitSentences: + def test_basic_sentence_splitting(self): + result = _split_sentences("Hello world. How are you?") + assert result == ["Hello world.", "How are you?"] + + def test_abbreviation_dr(self): + result = _split_sentences("Dr. Smith is here. He is good.") + assert result == ["Dr. Smith is here.", "He is good."] + + def test_abbreviation_inc(self): + result = _split_sentences("Apple Inc. reported earnings. Revenue grew.") + assert result == ["Apple Inc. reported earnings.", "Revenue grew."] + + def test_exclamation_and_question_marks(self): + result = _split_sentences("What happened! Tell me. 
Now!") + assert result == ["What happened!", "Tell me.", "Now!"] + + def test_single_sentence(self): + result = _split_sentences("Just one sentence.") + assert result == ["Just one sentence."] + + def test_no_split_when_no_capital_after_period(self): + result = _split_sentences("count is 3.5 million total") + assert result == ["count is 3.5 million total"] + + def test_multiple_abbreviations(self): + result = _split_sentences("Mr. and Mrs. Smith went out. They had fun.") + assert result == ["Mr. and Mrs. Smith went out.", "They had fun."] diff --git a/services/crawler/tests/test_embedding_service.py b/services/crawler/tests/test_embedding_service.py new file mode 100644 index 0000000000..8d4f6a004e --- /dev/null +++ b/services/crawler/tests/test_embedding_service.py @@ -0,0 +1,163 @@ +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from app.services.embedding_service import EmbeddingService + + +def make_embedding_response(embeddings: list[list[float]]): + return SimpleNamespace(data=[SimpleNamespace(embedding=e) for e in embeddings]) + + +def create_service(dimensions: int = 1536) -> EmbeddingService: + service = EmbeddingService( + api_key="test-key", + base_url=None, + model="test-model", + dimensions=dimensions, + ) + mock_client = MagicMock() + mock_client.embeddings.create = AsyncMock() + service._client = mock_client + return service + + +class TestDimensionsProperty: + def test_returns_configured_dimensions(self): + service = create_service(dimensions=768) + assert service.dimensions == 768 + + def test_returns_default_dimensions(self): + service = create_service() + assert service.dimensions == 1536 + + +class TestEmbedTexts: + async def test_empty_texts_returns_empty_list(self): + service = create_service() + result = await service.embed_texts([]) + assert result == [] + service._client.embeddings.create.assert_not_called() + + async def test_single_text(self): + service = 
create_service(dimensions=3) + expected = [0.1, 0.2, 0.3] + service._client.embeddings.create.return_value = make_embedding_response([expected]) + + result = await service.embed_texts(["hello"]) + + assert result == [expected] + service._client.embeddings.create.assert_called_once_with( + model="test-model", + input=["hello"], + dimensions=3, + ) + + async def test_multiple_texts_single_batch(self): + service = create_service(dimensions=2) + embeddings = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]] + service._client.embeddings.create.return_value = make_embedding_response(embeddings) + + result = await service.embed_texts(["a", "b", "c"]) + + assert result == embeddings + service._client.embeddings.create.assert_called_once_with( + model="test-model", + input=["a", "b", "c"], + dimensions=2, + ) + + async def test_batching_splits_large_input(self, monkeypatch): + import app.services.embedding_service as module + + monkeypatch.setattr(module, "MAX_BATCH_SIZE", 2) + + service = create_service(dimensions=2) + batch1_embeddings = [[0.1, 0.2], [0.3, 0.4]] + batch2_embeddings = [[0.5, 0.6]] + service._client.embeddings.create.side_effect = [ + make_embedding_response(batch1_embeddings), + make_embedding_response(batch2_embeddings), + ] + + result = await service.embed_texts(["a", "b", "c"]) + + assert result == [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]] + assert service._client.embeddings.create.call_count == 2 + calls = service._client.embeddings.create.call_args_list + assert calls[0].kwargs == {"model": "test-model", "input": ["a", "b"], "dimensions": 2} + assert calls[1].kwargs == {"model": "test-model", "input": ["c"], "dimensions": 2} + + async def test_batching_exact_multiple(self, monkeypatch): + import app.services.embedding_service as module + + monkeypatch.setattr(module, "MAX_BATCH_SIZE", 2) + + service = create_service(dimensions=1) + service._client.embeddings.create.side_effect = [ + make_embedding_response([[1.0], [2.0]]), + make_embedding_response([[3.0], [4.0]]), + ] 
+ + result = await service.embed_texts(["a", "b", "c", "d"]) + + assert result == [[1.0], [2.0], [3.0], [4.0]] + assert service._client.embeddings.create.call_count == 2 + + +class TestEmbedQuery: + async def test_returns_single_vector(self): + service = create_service(dimensions=3) + expected = [0.1, 0.2, 0.3] + service._client.embeddings.create.return_value = make_embedding_response([expected]) + + result = await service.embed_query("search term") + + assert result == expected + service._client.embeddings.create.assert_called_once_with( + model="test-model", + input=["search term"], + dimensions=3, + ) + + +class TestRetryBehavior: + @patch("app.services.embedding_service.asyncio.sleep", new_callable=AsyncMock) + async def test_retries_on_first_failure(self, mock_sleep): + service = create_service(dimensions=2) + expected = [[0.1, 0.2]] + service._client.embeddings.create.side_effect = [ + RuntimeError("API error"), + make_embedding_response(expected), + ] + + result = await service.embed_texts(["hello"]) + + assert result == expected + assert service._client.embeddings.create.call_count == 2 + mock_sleep.assert_awaited_once() + + @patch("app.services.embedding_service.asyncio.sleep", new_callable=AsyncMock) + async def test_raises_on_second_failure(self, mock_sleep): + service = create_service(dimensions=2) + service._client.embeddings.create.side_effect = [ + RuntimeError("API error"), + RuntimeError("API error again"), + ] + + with pytest.raises(RuntimeError, match="API error again"): + await service.embed_texts(["hello"]) + + assert service._client.embeddings.create.call_count == 2 + mock_sleep.assert_awaited_once() + + async def test_no_retry_on_success(self): + service = create_service(dimensions=2) + expected = [[0.1, 0.2]] + service._client.embeddings.create.return_value = make_embedding_response(expected) + + result = await service.embed_texts(["hello"]) + + assert result == expected + assert service._client.embeddings.create.call_count == 1 diff --git 
a/services/crawler/tests/test_index_router.py b/services/crawler/tests/test_index_router.py new file mode 100644 index 0000000000..b21cbcd586 --- /dev/null +++ b/services/crawler/tests/test_index_router.py @@ -0,0 +1,134 @@ +from unittest.mock import AsyncMock + +import pytest +from fastapi import FastAPI +from httpx import ASGITransport, AsyncClient + +from app.routers.index import router, set_indexing_service + +app = FastAPI() +app.include_router(router) + + +@pytest.fixture +def mock_indexing_service(): + service = AsyncMock() + set_indexing_service(service) + yield service + set_indexing_service(None) + + +class TestIndexPage: + async def test_success(self, mock_indexing_service): + mock_indexing_service.index_page.return_value = { + "url": "https://example.com/page", + "status": "indexed", + "chunks_indexed": 5, + } + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/api/v1/index/page", + json={ + "domain": "example.com", + "url": "https://example.com/page", + "title": "Test Page", + "content": "Some content to index", + }, + ) + + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + assert data["url"] == "https://example.com/page" + assert data["chunks_indexed"] == 5 + assert data["status"] == "indexed" + mock_indexing_service.index_page.assert_awaited_once_with( + domain="example.com", + url="https://example.com/page", + title="Test Page", + content="Some content to index", + ) + + async def test_skipped_page(self, mock_indexing_service): + mock_indexing_service.index_page.return_value = { + "url": "https://example.com/page", + "status": "skipped", + "chunks_indexed": 0, + } + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/api/v1/index/page", + json={"domain": "example.com", "url": "https://example.com/page", "content": "Same content"}, + ) + + assert 
response.status_code == 200 + data = response.json() + assert data["success"] is True + assert data["status"] == "skipped" + + async def test_503_when_service_not_initialized(self): + set_indexing_service(None) + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/api/v1/index/page", + json={"domain": "example.com", "url": "https://example.com/page", "content": "content"}, + ) + + assert response.status_code == 503 + assert response.json()["detail"] == "Indexing service not initialized" + + async def test_500_on_unexpected_error(self, mock_indexing_service): + mock_indexing_service.index_page.side_effect = RuntimeError("db error") + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/api/v1/index/page", + json={"domain": "example.com", "url": "https://example.com/page", "content": "content"}, + ) + + assert response.status_code == 500 + assert response.json()["detail"] == "Indexing failed" + + +class TestIndexWebsite: + async def test_success(self, mock_indexing_service): + mock_indexing_service.index_website.return_value = { + "domain": "example.com", + "pages_indexed": 10, + "pages_skipped": 2, + "pages_failed": 1, + "total_chunks": 50, + } + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post("/api/v1/index/website/example.com") + + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + assert data["domain"] == "example.com" + assert data["pages_indexed"] == 10 + assert data["pages_skipped"] == 2 + assert data["pages_failed"] == 1 + assert data["total_chunks"] == 50 + mock_indexing_service.index_website.assert_awaited_once_with("example.com") + + async def test_503_when_service_not_initialized(self): + set_indexing_service(None) + + async with AsyncClient(transport=ASGITransport(app=app), 
base_url="http://test") as client: + response = await client.post("/api/v1/index/website/example.com") + + assert response.status_code == 503 + + async def test_500_on_unexpected_error(self, mock_indexing_service): + mock_indexing_service.index_website.side_effect = RuntimeError("boom") + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post("/api/v1/index/website/example.com") + + assert response.status_code == 500 + assert response.json()["detail"] == "Website indexing failed" diff --git a/services/crawler/tests/test_indexing_service.py b/services/crawler/tests/test_indexing_service.py new file mode 100644 index 0000000000..bee270cd9a --- /dev/null +++ b/services/crawler/tests/test_indexing_service.py @@ -0,0 +1,166 @@ +import hashlib +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from app.services.chunking_service import ContentChunk +from app.services.indexing_service import IndexingService + + +def _sha256(content: str) -> str: + return hashlib.sha256(content.encode()).hexdigest() + + +@pytest.fixture +def mock_conn(): + conn = AsyncMock() + conn.fetchval = AsyncMock(return_value=None) + conn.fetch = AsyncMock(return_value=[]) + conn.execute = AsyncMock(return_value="DELETE 0") + conn.executemany = AsyncMock() + conn.transaction = MagicMock(return_value=AsyncMock(__aenter__=AsyncMock(), __aexit__=AsyncMock())) + return conn + + +@pytest.fixture +def mock_pool(mock_conn): + pool = AsyncMock() + ctx = AsyncMock() + ctx.__aenter__ = AsyncMock(return_value=mock_conn) + ctx.__aexit__ = AsyncMock(return_value=None) + pool.acquire = MagicMock(return_value=ctx) + return pool + + +@pytest.fixture +def mock_embedding(): + service = AsyncMock() + service.embed_texts = AsyncMock(return_value=[[0.1, 0.2], [0.3, 0.4]]) + return service + + +@pytest.fixture +def indexing_service(mock_pool, mock_embedding): + return IndexingService(mock_pool, mock_embedding) + + +class 
TestIndexPage: + async def test_skips_when_content_hash_matches(self, mock_conn, indexing_service): + content = "some page content" + mock_conn.fetchval = AsyncMock(return_value=_sha256(content)) + + result = await indexing_service.index_page("example.com", "https://example.com/page", "Title", content) + + assert result["status"] == "skipped" + assert result["chunks_indexed"] == 0 + + @patch("app.services.indexing_service.chunk_content", return_value=[]) + async def test_returns_empty_when_no_chunks(self, mock_chunk, indexing_service): + result = await indexing_service.index_page("example.com", "https://example.com/page", "Title", "content") + + assert result["status"] == "empty" + assert result["chunks_indexed"] == 0 + + @patch("app.services.indexing_service.chunk_content") + async def test_returns_error_when_embedding_fails(self, mock_chunk, indexing_service, mock_embedding): + mock_chunk.return_value = [ContentChunk(content="chunk text", index=0)] + mock_embedding.embed_texts = AsyncMock(side_effect=RuntimeError("API down")) + + result = await indexing_service.index_page("example.com", "https://example.com/page", "Title", "content") + + assert result["status"] == "error" + assert result["error"] == "embedding_failed" + assert result["chunks_indexed"] == 0 + + @patch("app.services.indexing_service.chunk_content") + async def test_indexes_successfully(self, mock_chunk, indexing_service, mock_embedding): + chunks = [ContentChunk(content="chunk one", index=0), ContentChunk(content="chunk two", index=1)] + mock_chunk.return_value = chunks + mock_embedding.embed_texts = AsyncMock(return_value=[[0.1, 0.2], [0.3, 0.4]]) + + result = await indexing_service.index_page("example.com", "https://example.com/page", "Title", "content") + + assert result["status"] == "indexed" + assert result["chunks_indexed"] == 2 + assert result["url"] == "https://example.com/page" + + @patch("app.services.indexing_service.chunk_content") + async def 
test_deletes_old_chunks_before_inserting(self, mock_chunk, indexing_service, mock_conn, mock_embedding): + chunks = [ContentChunk(content="chunk", index=0)] + mock_chunk.return_value = chunks + mock_embedding.embed_texts = AsyncMock(return_value=[[0.1, 0.2]]) + + await indexing_service.index_page("example.com", "https://example.com/page", "Title", "content") + + calls = [str(c) for c in mock_conn.execute.call_args_list] + delete_call = next(c for c in calls if "DELETE" in c) + assert "https://example.com/page" in delete_call + mock_conn.executemany.assert_called_once() + + +class TestDeletePageChunks: + async def test_returns_deleted_count(self, indexing_service, mock_conn): + mock_conn.execute = AsyncMock(return_value="DELETE 5") + + count = await indexing_service.delete_page_chunks("https://example.com/page") + + assert count == 5 + + async def test_returns_zero_when_no_rows_deleted(self, indexing_service, mock_conn): + mock_conn.execute = AsyncMock(return_value="DELETE 0") + + count = await indexing_service.delete_page_chunks("https://example.com/page") + + assert count == 0 + + async def test_returns_zero_when_result_is_empty(self, indexing_service, mock_conn): + mock_conn.execute = AsyncMock(return_value="") + + count = await indexing_service.delete_page_chunks("https://example.com/page") + + assert count == 0 + + +class TestIndexWebsite: + async def test_aggregates_results_correctly(self, indexing_service, mock_conn): + mock_conn.fetch = AsyncMock( + return_value=[ + {"url": "https://example.com/a", "title": "Page A", "content": "aaa"}, + {"url": "https://example.com/b", "title": "Page B", "content": "bbb"}, + {"url": "https://example.com/c", "title": "Page C", "content": "ccc"}, + ] + ) + + call_count = 0 + results = [ + {"url": "https://example.com/a", "status": "indexed", "chunks_indexed": 3}, + {"url": "https://example.com/b", "status": "skipped", "chunks_indexed": 0}, + {"url": "https://example.com/c", "status": "error", "chunks_indexed": 0, "error": 
"embedding_failed"}, + ] + + async def fake_index_page(domain, url, title, content): + nonlocal call_count + result = results[call_count] + call_count += 1 + return result + + indexing_service.index_page = fake_index_page + + result = await indexing_service.index_website("example.com") + + assert result["domain"] == "example.com" + assert result["pages_indexed"] == 1 + assert result["pages_skipped"] == 1 + assert result["pages_failed"] == 1 + assert result["total_chunks"] == 3 + + async def test_returns_zeros_when_no_pages(self, indexing_service, mock_conn): + mock_conn.fetch = AsyncMock(return_value=[]) + + result = await indexing_service.index_website("empty.com") + + assert result["domain"] == "empty.com" + assert result["pages_indexed"] == 0 + assert result["pages_skipped"] == 0 + assert result["pages_failed"] == 0 + assert result["total_chunks"] == 0 diff --git a/services/crawler/tests/test_pages_router.py b/services/crawler/tests/test_pages_router.py new file mode 100644 index 0000000000..22dbd63963 --- /dev/null +++ b/services/crawler/tests/test_pages_router.py @@ -0,0 +1,204 @@ +from datetime import datetime, timezone +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from fastapi import FastAPI +from httpx import ASGITransport, AsyncClient + +from app.routers.pages import router + +app = FastAPI() +app.include_router(router) + +_DEFAULT_CRAWLED = datetime(2025, 6, 1, 12, 0, 0, tzinfo=timezone.utc) +_DEFAULT_DISCOVERED = datetime(2025, 5, 15, 8, 0, 0, tzinfo=timezone.utc) + + +class FakeRecord(dict): + """Dict subclass mimicking asyncpg Record with r["field"] access.""" + + +def _make_row(**overrides): + defaults = { + "url": "https://example.com/page1", + "title": "Page 1", + "word_count": 500, + "status": "active", + "content_hash": "abc123", + "last_crawled_at": _DEFAULT_CRAWLED, + "discovered_at": _DEFAULT_DISCOVERED, + "chunks_count": 3, + } + defaults.update(overrides) + return FakeRecord(defaults) + + +@pytest.fixture +def 
mock_pool(): + conn = AsyncMock() + pool = MagicMock() + ctx = AsyncMock() + ctx.__aenter__ = AsyncMock(return_value=conn) + ctx.__aexit__ = AsyncMock(return_value=False) + pool.acquire.return_value = ctx + with patch("app.routers.pages.get_pool", return_value=pool): + yield conn + + +class TestListPages: + async def test_success(self, mock_pool): + rows = [ + _make_row(url="https://example.com/a", title="Page A", word_count=100, chunks_count=2), + _make_row(url="https://example.com/b", title="Page B", word_count=200, chunks_count=0), + ] + mock_pool.fetch.return_value = rows + mock_pool.fetchval.return_value = 2 + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/pages/example.com") + + assert response.status_code == 200 + data = response.json() + assert data["domain"] == "example.com" + assert data["total"] == 2 + assert data["offset"] == 0 + assert data["has_more"] is False + assert len(data["pages"]) == 2 + + page_a = data["pages"][0] + assert page_a["url"] == "https://example.com/a" + assert page_a["title"] == "Page A" + assert page_a["word_count"] == 100 + assert page_a["chunks_count"] == 2 + assert page_a["indexed"] is True + + page_b = data["pages"][1] + assert page_b["url"] == "https://example.com/b" + assert page_b["chunks_count"] == 0 + assert page_b["indexed"] is False + + async def test_empty_result(self, mock_pool): + mock_pool.fetch.return_value = [] + mock_pool.fetchval.return_value = 0 + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/pages/unknown.com") + + assert response.status_code == 200 + data = response.json() + assert data["domain"] == "unknown.com" + assert data["pages"] == [] + assert data["total"] == 0 + assert data["has_more"] is False + + async def test_status_filter(self, mock_pool): + mock_pool.fetch.return_value = [] + mock_pool.fetchval.return_value = 0 + + async 
with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/pages/example.com?status=active") + + assert response.status_code == 200 + + fetch_call = mock_pool.fetch.call_args + query = fetch_call[0][0] + assert "wu.status = $2" in query + + params = fetch_call[0][1:] + assert params[0] == "example.com" + assert params[1] == "active" + + async def test_has_more_true(self, mock_pool): + mock_pool.fetch.return_value = [_make_row(url="https://example.com/p1")] + mock_pool.fetchval.return_value = 50 + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/pages/example.com?offset=0&limit=10") + + assert response.status_code == 200 + data = response.json() + assert data["has_more"] is True + assert data["total"] == 50 + assert data["offset"] == 0 + + async def test_has_more_false_at_end(self, mock_pool): + mock_pool.fetch.return_value = [_make_row(url="https://example.com/p1")] + mock_pool.fetchval.return_value = 50 + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/pages/example.com?offset=40&limit=10") + + assert response.status_code == 200 + data = response.json() + assert data["has_more"] is False + + async def test_sort_param(self, mock_pool): + mock_pool.fetch.return_value = [] + mock_pool.fetchval.return_value = 0 + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/pages/example.com?sort=word_count") + + assert response.status_code == 200 + + query = mock_pool.fetch.call_args[0][0] + assert "ORDER BY wu.word_count DESC" in query + + async def test_invalid_sort_falls_back(self, mock_pool): + mock_pool.fetch.return_value = [] + mock_pool.fetchval.return_value = 0 + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + 
response = await client.get("/api/v1/pages/example.com?sort=invalid_field") + + assert response.status_code == 200 + + query = mock_pool.fetch.call_args[0][0] + assert "ORDER BY wu.last_crawled_at DESC" in query + + async def test_pagination_params_passed(self, mock_pool): + mock_pool.fetch.return_value = [] + mock_pool.fetchval.return_value = 0 + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/pages/example.com?offset=20&limit=50") + + assert response.status_code == 200 + + fetch_call = mock_pool.fetch.call_args + params = fetch_call[0][1:] + assert 50 in params + assert 20 in params + + async def test_null_timestamps(self, mock_pool): + mock_pool.fetch.return_value = [ + _make_row(url="https://example.com/new", last_crawled_at=None, discovered_at=None), + ] + mock_pool.fetchval.return_value = 1 + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/pages/example.com") + + assert response.status_code == 200 + page = response.json()["pages"][0] + assert page["last_crawled_at"] is None + assert page["discovered_at"] is None + + async def test_null_word_count_defaults_to_zero(self, mock_pool): + mock_pool.fetch.return_value = [_make_row(url="https://example.com/empty", word_count=None)] + mock_pool.fetchval.return_value = 1 + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/pages/example.com") + + assert response.status_code == 200 + assert response.json()["pages"][0]["word_count"] == 0 + + async def test_500_on_database_error(self, mock_pool): + mock_pool.fetch.side_effect = RuntimeError("connection lost") + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/pages/example.com") + + assert response.status_code == 500 + assert 
response.json()["detail"] == "Failed to list pages" diff --git a/services/crawler/tests/test_search_router.py b/services/crawler/tests/test_search_router.py new file mode 100644 index 0000000000..cc812bf970 --- /dev/null +++ b/services/crawler/tests/test_search_router.py @@ -0,0 +1,129 @@ +from unittest.mock import AsyncMock + +import pytest +from fastapi import FastAPI +from httpx import ASGITransport, AsyncClient + +from app.routers.search import router, set_search_service +from app.services.search_service import SearchResult + +app = FastAPI() +app.include_router(router) + + +@pytest.fixture +def mock_search_service(): + service = AsyncMock() + set_search_service(service) + yield service + set_search_service(None) + + +class TestSearchAll: + async def test_returns_results(self, mock_search_service): + mock_search_service.search.return_value = [ + SearchResult( + url="https://example.com/page1", title="Page 1", chunk_content="Hello world", chunk_index=0, score=0.95 + ), + SearchResult( + url="https://example.com/page2", + title="Page 2", + chunk_content="Goodbye world", + chunk_index=1, + score=0.80, + ), + ] + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post("/api/v1/search", json={"query": "hello", "limit": 10}) + + assert response.status_code == 200 + data = response.json() + assert data["query"] == "hello" + assert data["total"] == 2 + assert len(data["results"]) == 2 + assert data["results"][0]["url"] == "https://example.com/page1" + assert data["results"][0]["score"] == 0.95 + mock_search_service.search.assert_awaited_once_with(query="hello", limit=10) + + async def test_returns_empty_results(self, mock_search_service): + mock_search_service.search.return_value = [] + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post("/api/v1/search", json={"query": "nonexistent"}) + + assert response.status_code == 200 + 
data = response.json() + assert data["query"] == "nonexistent" + assert data["total"] == 0 + assert data["results"] == [] + + async def test_uses_default_limit(self, mock_search_service): + mock_search_service.search.return_value = [] + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + await client.post("/api/v1/search", json={"query": "test"}) + + mock_search_service.search.assert_awaited_once_with(query="test", limit=10) + + async def test_503_when_service_not_initialized(self): + set_search_service(None) + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post("/api/v1/search", json={"query": "test"}) + + assert response.status_code == 503 + assert response.json()["detail"] == "Search service not initialized" + + async def test_500_on_unexpected_error(self, mock_search_service): + mock_search_service.search.side_effect = RuntimeError("db gone") + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post("/api/v1/search", json={"query": "boom"}) + + assert response.status_code == 500 + assert response.json()["detail"] == "Search failed" + + +class TestSearchDomain: + async def test_routes_domain_correctly(self, mock_search_service): + mock_search_service.search.return_value = [ + SearchResult( + url="https://docs.example.com/intro", title="Intro", chunk_content="Welcome", chunk_index=0, score=1.0 + ), + ] + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post("/api/v1/search/docs.example.com", json={"query": "welcome", "limit": 5}) + + assert response.status_code == 200 + data = response.json() + assert data["total"] == 1 + assert data["results"][0]["url"] == "https://docs.example.com/intro" + mock_search_service.search.assert_awaited_once_with(query="welcome", domain="docs.example.com", limit=5) + + async def 
test_empty_domain_results(self, mock_search_service): + mock_search_service.search.return_value = [] + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post("/api/v1/search/unknown.com", json={"query": "anything"}) + + assert response.status_code == 200 + assert response.json()["total"] == 0 + + async def test_503_when_service_not_initialized(self): + set_search_service(None) + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post("/api/v1/search/example.com", json={"query": "test"}) + + assert response.status_code == 503 + + async def test_500_on_unexpected_error(self, mock_search_service): + mock_search_service.search.side_effect = RuntimeError("oops") + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post("/api/v1/search/example.com", json={"query": "fail"}) + + assert response.status_code == 500 + assert response.json()["detail"] == "Search failed" diff --git a/services/crawler/tests/test_search_service.py b/services/crawler/tests/test_search_service.py new file mode 100644 index 0000000000..57db9641df --- /dev/null +++ b/services/crawler/tests/test_search_service.py @@ -0,0 +1,174 @@ +"""Tests for SearchService RRF merge logic.""" + +import pytest + +from app.services.search_service import RRF_K, SearchResult, SearchService + + +def _item(id, url="https://example.com", title="Title", chunk_content="content", chunk_index=0): + return {"id": id, "url": url, "title": title, "chunk_content": chunk_content, "chunk_index": chunk_index} + + +class TestMergeRrfEmptyInput: + def test_no_ranked_lists(self): + assert SearchService._merge_rrf([], limit=10) == [] + + def test_single_empty_list(self): + assert SearchService._merge_rrf([[]], limit=10) == [] + + def test_multiple_empty_lists(self): + assert SearchService._merge_rrf([[], []], limit=10) == [] + + 
+class TestMergeRrfSingleList:  # one ranked list: every item is returned, in rank order
+    def test_single_list_returns_all_items(self):
+        items = [_item(1, url="https://a.com"), _item(2, url="https://b.com")]
+        results = SearchService._merge_rrf([items], limit=10)
+        assert len(results) == 2
+        assert results[0].url == "https://a.com"
+        assert results[1].url == "https://b.com"
+
+    def test_single_list_preserves_rank_order(self):
+        items = [_item(10), _item(20), _item(30)]
+        results = SearchService._merge_rrf([items], limit=10)
+        assert [r.score for r in results] == sorted([r.score for r in results], reverse=True)
+
+    def test_single_item(self):
+        results = SearchService._merge_rrf([[_item(1, chunk_content="hello")]], limit=10)
+        assert len(results) == 1
+        assert results[0].chunk_content == "hello"
+
+
+class TestMergeRrfOverlappingItems:  # the same id appearing in several ranked lists
+    def test_overlapping_item_boosted_above_disjoint(self):
+        list_a = [_item(1, url="https://shared.example"), _item(2, url="https://a.example")]
+        list_b = [_item(1, url="https://shared.example"), _item(3, url="https://b.example")]
+        results = SearchService._merge_rrf([list_a, list_b], limit=10)
+        # Distinct URLs: with the shared default URL this check could never fail.
+        assert results[0].url == "https://shared.example"
+        assert results[0].score > results[1].score
+
+    def test_overlapping_item_score_equals_sum_of_rrf(self):
+        list_a = [_item(1), _item(2)]  # id 2 is a single-hit reference point for the normalised score
+        list_b = [_item(1)]
+        results = SearchService._merge_rrf([list_a, list_b], limit=10)
+        expected_raw = 2 * (1.0 / (RRF_K + 0 + 1))
+        assert len(results) == 2
+        assert results[0].score == pytest.approx(1.0)
+        assert results[1].score == pytest.approx((1.0 / (RRF_K + 1 + 1)) / expected_raw)
+
+    def test_overlapping_at_different_ranks(self):
+        list_a = [_item(1), _item(2)]
+        list_b = [_item(2), _item(1)]
+        results = SearchService._merge_rrf([list_a, list_b], limit=10)
+        assert len(results) == 2
+        assert results[0].score == pytest.approx(results[1].score)
+
+
+class TestMergeRrfDisjointItems:  # ranked lists that share no ids
+    def test_disjoint_lists_merged(self):
+        list_a = [_item(1, url="https://a.com")]
+        list_b = [_item(2, url="https://b.com")]
+        results = SearchService._merge_rrf([list_a, list_b], limit=10)
+        assert len(results) == 2
+        urls = {r.url for r in results}
+        assert urls == {"https://a.com", "https://b.com"}
+
+    def test_disjoint_same_rank_have_equal_scores(self):
+        list_a = [_item(1)]
+        list_b = [_item(2)]
+        results = SearchService._merge_rrf([list_a, list_b], limit=10)
+        assert results[0].score == pytest.approx(results[1].score)
+
+    def test_disjoint_different_ranks(self):
+        list_a = [_item(1), _item(2)]
+        list_b = [_item(3)]
+        results = SearchService._merge_rrf([list_a, list_b], limit=10)
+        rank0_score = 1.0 / (RRF_K + 0 + 1)
+        rank1_score = 1.0 / (RRF_K + 1 + 1)
+        top = [r for r in results if r.score == pytest.approx(1.0)]
+        assert len(top) == 2
+        bottom = [r for r in results if r.score < 1.0]
+        assert len(bottom) == 1
+        assert bottom[0].score == pytest.approx(rank1_score / rank0_score)
+
+
+class TestMergeRrfLimitTruncation:  # the limit argument caps the number of results
+    def test_limit_truncates_results(self):
+        items = [_item(i) for i in range(20)]
+        results = SearchService._merge_rrf([items], limit=5)
+        assert len(results) == 5
+
+    def test_limit_larger_than_items_returns_all(self):
+        items = [_item(1), _item(2)]
+        results = SearchService._merge_rrf([items], limit=100)
+        assert len(results) == 2
+
+    def test_limit_zero_returns_empty(self):
+        items = [_item(1), _item(2)]
+        results = SearchService._merge_rrf([items], limit=0)
+        assert results == []
+
+    def test_limit_one_returns_top_result(self):
+        list_a = [_item(1, url="https://shared.example"), _item(2)]
+        list_b = [_item(1, url="https://shared.example"), _item(3)]
+        results = SearchService._merge_rrf([list_a, list_b], limit=1)
+        assert [r.url for r in results] == ["https://shared.example"]  # exactly one result, and it is the top one
+
+
+class TestMergeRrfScoreNormalization:  # scores are scaled so the best result scores 1.0
+    def test_top_result_always_has_score_one(self):
+        items = [_item(i) for i in range(5)]
+        results = SearchService._merge_rrf([items], limit=10)
+        assert results[0].score == pytest.approx(1.0)
+
+    def test_top_result_score_one_with_multiple_lists(self):
+        list_a = [_item(1), _item(2), _item(3)]
+        list_b = [_item(4), _item(1), _item(5)]
+        results = SearchService._merge_rrf([list_a, list_b], limit=10)
+        assert results[0].score == pytest.approx(1.0)
+
+    def test_scores_are_between_zero_and_one(self):
+        list_a = [_item(i) for i in range(10)]
+        list_b = [_item(i + 5) for i in range(10)]
+        results = SearchService._merge_rrf([list_a, list_b], limit=20)
+        for r in results:
+            assert 0.0 < r.score <= 1.0
+
+    def test_normalized_scores_preserve_relative_order(self):
+        list_a = [_item(1), _item(2), _item(3)]
+        results = SearchService._merge_rrf([list_a], limit=10)
+        scores = [r.score for r in results]
+        assert scores == sorted(scores, reverse=True)
+
+
+class TestMergeRrfFieldMapping:  # input dict keys map onto SearchResult attributes
+    def test_fields_mapped_correctly(self):
+        item = _item(42, url="https://test.dev", title="Test Page", chunk_content="some text", chunk_index=3)
+        results = SearchService._merge_rrf([[item]], limit=10)
+        assert len(results) == 1
+        r = results[0]
+        assert r.url == "https://test.dev"
+        assert r.title == "Test Page"
+        assert r.chunk_content == "some text"
+        assert r.chunk_index == 3
+
+    def test_title_can_be_none(self):
+        item = {"id": 1, "url": "https://x.com", "title": None, "chunk_content": "c", "chunk_index": 0}
+        results = SearchService._merge_rrf([[item]], limit=10)
+        assert results[0].title is None
+
+    def test_missing_title_key_defaults_to_none(self):
+        item = {"id": 1, "url": "https://x.com", "chunk_content": "c", "chunk_index": 0}
+        results = SearchService._merge_rrf([[item]], limit=10)
+        assert results[0].title is None
+
+    def test_returns_search_result_instances(self):
+        results = SearchService._merge_rrf([[_item(1)]], limit=10)
+        assert isinstance(results[0], SearchResult)
+
+    def test_later_list_overwrites_item_metadata(self):
+        item_v1 = _item(1, title="Old Title")
+        item_v2 = _item(1, title="New Title")
+        results = SearchService._merge_rrf([[item_v1], [item_v2]], limit=10)
+        assert results[0].title == "New Title"
diff --git a/services/crawler/tests/test_website_store.py b/services/crawler/tests/test_website_store.py
deleted file mode 100644
index a3849aa8dc..0000000000
--- a/services/crawler/tests/test_website_store.py
+++ /dev/null
@@ -1,701 +0,0 @@ -""" -Tests for WebsiteStore and WebsiteStoreManager. - -Uses importlib to load website_store directly, bypassing the app.services -barrel __init__.py which pulls in heavy dependencies (playwright, crawl4ai). -""" - -import importlib.util -import sys -from pathlib import Path - -import pytest - -# Load website_store module directly to avoid app.services.__init__ barrel import -_module_path = Path(__file__).resolve().parent.parent / "app" / "services" / "website_store.py" -_spec = importlib.util.spec_from_file_location("website_store", _module_path) -_mod = importlib.util.module_from_spec(_spec) -sys.modules["website_store"] = _mod -_spec.loader.exec_module(_mod) - -WebsiteStore = _mod.WebsiteStore -WebsiteStoreManager = _mod.WebsiteStoreManager -_sanitize_domain = _mod._sanitize_domain - - -@pytest.fixture -def tmp_data_dir(tmp_path): - return tmp_path / "data" - - -@pytest.fixture -def site_store(tmp_path): - db_path = tmp_path / "sites" / "example_com.db" - store = WebsiteStore(db_path) - yield store - store.close() - - -@pytest.fixture -def manager(tmp_data_dir): - mgr = WebsiteStoreManager(data_dir=tmp_data_dir) - yield mgr - mgr.close_all() - - -class TestSanitizeDomain: - def test_replaces_dots_and_hyphens(self): - assert _sanitize_domain("my-site.example.com") == "my_site_example_com" - - def test_no_special_chars(self): - assert _sanitize_domain("localhost") == "localhost" - - -class TestWebsiteStore: - def test_creates_db_file(self, tmp_path): - db_path = tmp_path / "nested" / "dir" / "test.db" - store = WebsiteStore(db_path) - assert db_path.exists() - store.close() - - def test_save_discovered_urls(self, site_store): - urls = [{"url": "https://example.com/a"}, {"url": "https://example.com/b"}] - inserted = site_store.save_discovered_urls(urls) - assert inserted >= 2 - # Discovered URLs have no content_hash yet, so not counted - assert site_store.get_total_count() == 0 - - def test_save_discovered_urls_ignores_duplicates(self, 
site_store): - urls = [{"url": "https://example.com/a"}] - site_store.save_discovered_urls(urls) - site_store.save_discovered_urls(urls) - assert site_store.get_total_count() == 0 - - def test_save_discovered_urls_empty(self, site_store): - assert site_store.save_discovered_urls([]) == 0 - - def test_get_urls_page_excludes_null_hash(self, site_store): - site_store.save_discovered_urls([{"url": "https://example.com/a"}]) - assert site_store.get_urls_page() == [] - - def test_get_urls_page_basic(self, site_store): - urls = [{"url": f"https://example.com/{i}"} for i in range(5)] - site_store.save_discovered_urls(urls) - site_store.update_content_hashes( - [{"url": f"https://example.com/{i}", "content_hash": f"h{i}"} for i in range(5)] - ) - - page = site_store.get_urls_page(offset=0, limit=3) - assert len(page) == 3 - assert all("url" in p and "content_hash" in p and "status" in p for p in page) - - def test_get_urls_page_offset(self, site_store): - urls = [{"url": f"https://example.com/{i}"} for i in range(5)] - site_store.save_discovered_urls(urls) - site_store.update_content_hashes( - [{"url": f"https://example.com/{i}", "content_hash": f"h{i}"} for i in range(5)] - ) - - page = site_store.get_urls_page(offset=3, limit=10) - assert len(page) == 2 - - def test_get_urls_page_with_status_filter(self, site_store): - urls = [{"url": "https://example.com/a"}, {"url": "https://example.com/b"}] - site_store.save_discovered_urls(urls) - - site_store.update_content_hashes( - [ - {"url": "https://example.com/a", "content_hash": "abc", "status": "active"}, - ] - ) - - active = site_store.get_urls_page(status="active") - assert len(active) == 1 - assert active[0]["url"] == "https://example.com/a" - - # discovered URL has no hash, so excluded - discovered = site_store.get_urls_page(status="discovered") - assert len(discovered) == 0 - - def test_update_content_hashes(self, site_store): - site_store.save_discovered_urls([{"url": "https://example.com/page"}]) - - 
site_store.update_content_hashes( - [ - {"url": "https://example.com/page", "content_hash": "sha256abc"}, - ] - ) - - pages = site_store.get_urls_page() - assert len(pages) == 1 - assert pages[0]["content_hash"] == "sha256abc" - assert pages[0]["status"] == "active" - assert pages[0]["last_crawled_at"] is not None - - def test_update_content_hashes_empty(self, site_store): - # Should not raise - site_store.update_content_hashes([]) - - def test_mark_urls_deleted(self, site_store): - site_store.save_discovered_urls( - [ - {"url": "https://example.com/a"}, - {"url": "https://example.com/b"}, - ] - ) - site_store.update_content_hashes( - [ - {"url": "https://example.com/a", "content_hash": "ha", "status": "active"}, - {"url": "https://example.com/b", "content_hash": "hb", "status": "active"}, - ] - ) - - site_store.mark_urls_deleted(["https://example.com/a"]) - - deleted = site_store.get_urls_page(status="deleted") - assert len(deleted) == 1 - assert deleted[0]["url"] == "https://example.com/a" - - active = site_store.get_urls_page(status="active") - assert len(active) == 1 - - def test_mark_urls_deleted_empty(self, site_store): - site_store.mark_urls_deleted([]) - - def test_get_urls_needing_recrawl_prefers_no_hash(self, site_store): - site_store.save_discovered_urls( - [ - {"url": "https://example.com/new"}, - {"url": "https://example.com/old"}, - ] - ) - site_store.update_content_hashes( - [ - {"url": "https://example.com/old", "content_hash": "h1"}, - ] - ) - - needing = site_store.get_urls_needing_recrawl(limit=10) - assert len(needing) == 2 - # URL without hash should come first - assert needing[0] == "https://example.com/new" - - def test_get_urls_needing_recrawl_excludes_deleted(self, site_store): - site_store.save_discovered_urls( - [ - {"url": "https://example.com/a"}, - {"url": "https://example.com/b"}, - ] - ) - site_store.mark_urls_deleted(["https://example.com/a"]) - - needing = site_store.get_urls_needing_recrawl(limit=10) - assert len(needing) == 1 - 
assert needing[0] == "https://example.com/b" - - def test_get_urls_needing_recrawl_respects_limit(self, site_store): - urls = [{"url": f"https://example.com/{i}"} for i in range(10)] - site_store.save_discovered_urls(urls) - - needing = site_store.get_urls_needing_recrawl(limit=3) - assert len(needing) == 3 - - def test_get_urls_needing_recrawl_crawled_before_excludes_recent(self, site_store): - import time - - site_store.save_discovered_urls( - [ - {"url": "https://example.com/a"}, - {"url": "https://example.com/b"}, - ] - ) - cutoff = time.time() - # Crawl one URL after the cutoff - site_store.update_content_hashes([{"url": "https://example.com/a", "content_hash": "h1"}]) - - needing = site_store.get_urls_needing_recrawl(limit=10, crawled_before=cutoff) - assert needing == ["https://example.com/b"] - - def test_get_urls_needing_recrawl_crawled_before_includes_stale(self, site_store): - import time - - site_store.save_discovered_urls( - [ - {"url": "https://example.com/a"}, - {"url": "https://example.com/b"}, - ] - ) - # Crawl both URLs - site_store.update_content_hashes( - [ - {"url": "https://example.com/a", "content_hash": "h1"}, - {"url": "https://example.com/b", "content_hash": "h2"}, - ] - ) - cutoff = time.time() - - # Both were crawled before cutoff, so both should be returned - needing = site_store.get_urls_needing_recrawl(limit=10, crawled_before=cutoff) - assert len(needing) == 2 - - def test_increment_fail_count(self, site_store): - site_store.save_discovered_urls([{"url": "https://example.com/flaky"}]) - - site_store.increment_fail_count(["https://example.com/flaky"]) - conn = site_store._get_conn() - row = conn.execute( - "SELECT fail_count, last_crawled_at FROM website_urls WHERE url = ?", - ("https://example.com/flaky",), - ).fetchone() - assert row["fail_count"] == 1 - assert row["last_crawled_at"] is not None - - def test_increment_fail_count_accumulates(self, site_store): - site_store.save_discovered_urls([{"url": "https://example.com/flaky"}]) 
- - site_store.increment_fail_count(["https://example.com/flaky"]) - site_store.increment_fail_count(["https://example.com/flaky"]) - site_store.increment_fail_count(["https://example.com/flaky"]) - - conn = site_store._get_conn() - row = conn.execute( - "SELECT fail_count FROM website_urls WHERE url = ?", - ("https://example.com/flaky",), - ).fetchone() - assert row["fail_count"] == 3 - - def test_increment_fail_count_empty(self, site_store): - site_store.increment_fail_count([]) - - def test_successful_crawl_resets_fail_count(self, site_store): - site_store.save_discovered_urls([{"url": "https://example.com/flaky"}]) - site_store.increment_fail_count(["https://example.com/flaky"]) - site_store.increment_fail_count(["https://example.com/flaky"]) - - site_store.update_content_hashes([{"url": "https://example.com/flaky", "content_hash": "h1"}]) - - conn = site_store._get_conn() - row = conn.execute( - "SELECT fail_count FROM website_urls WHERE url = ?", - ("https://example.com/flaky",), - ).fetchone() - assert row["fail_count"] == 0 - - def test_increment_fail_count_sets_last_crawled_at_for_session_scoping(self, site_store): - import time - - site_store.save_discovered_urls([{"url": "https://example.com/fail"}]) - cutoff = time.time() - - # Increment fail count (sets last_crawled_at to now, which is after cutoff) - site_store.increment_fail_count(["https://example.com/fail"]) - - # URL should no longer appear in this scan session - needing = site_store.get_urls_needing_recrawl(limit=10, crawled_before=cutoff) - assert needing == [] - - def test_get_total_count(self, site_store): - assert site_store.get_total_count() == 0 - - site_store.save_discovered_urls([{"url": "https://example.com/a"}]) - # No hash yet, so count is still 0 - assert site_store.get_total_count() == 0 - - site_store.update_content_hashes([{"url": "https://example.com/a", "content_hash": "h1"}]) - assert site_store.get_total_count() == 1 - - def test_get_total_count_with_status(self, site_store): - 
site_store.save_discovered_urls( - [ - {"url": "https://example.com/a"}, - {"url": "https://example.com/b"}, - ] - ) - site_store.update_content_hashes( - [ - {"url": "https://example.com/a", "content_hash": "ha", "status": "active"}, - {"url": "https://example.com/b", "content_hash": "hb", "status": "active"}, - ] - ) - site_store.mark_urls_deleted(["https://example.com/a"]) - - assert site_store.get_total_count(status="deleted") == 1 - assert site_store.get_total_count(status="active") == 1 - - def test_update_content_hashes_with_page_data(self, site_store): - site_store.save_discovered_urls([{"url": "https://example.com/page"}]) - - site_store.update_content_hashes( - [ - { - "url": "https://example.com/page", - "content_hash": "sha256abc", - "status": "active", - "title": "Test Page", - "content": "# Hello World\n\nSome content here.", - "word_count": 5, - "metadata": '{"author": "test"}', - "structured_data": '{"og:title": "Test"}', - }, - ] - ) - - cached = site_store.get_cached_pages(["https://example.com/page"]) - assert len(cached) == 1 - assert cached[0]["url"] == "https://example.com/page" - assert cached[0]["title"] == "Test Page" - assert cached[0]["content"] == "# Hello World\n\nSome content here." 
- assert cached[0]["word_count"] == 5 - assert cached[0]["metadata"] == {"author": "test"} - assert cached[0]["structured_data"] == {"og:title": "Test"} - - def test_update_content_hashes_without_page_data(self, site_store): - site_store.save_discovered_urls([{"url": "https://example.com/page"}]) - - site_store.update_content_hashes([{"url": "https://example.com/page", "content_hash": "sha256abc"}]) - - cached = site_store.get_cached_pages(["https://example.com/page"]) - assert len(cached) == 0 - - def test_get_cached_pages_hit(self, site_store): - site_store.save_discovered_urls([{"url": "https://example.com/a"}]) - site_store.update_content_hashes( - [ - { - "url": "https://example.com/a", - "content_hash": "h1", - "content": "Page A content", - "title": "Page A", - "word_count": 3, - }, - ] - ) - - cached = site_store.get_cached_pages(["https://example.com/a"]) - assert len(cached) == 1 - assert cached[0]["content"] == "Page A content" - assert cached[0]["title"] == "Page A" - - def test_get_cached_pages_miss(self, site_store): - cached = site_store.get_cached_pages(["https://example.com/nonexistent"]) - assert len(cached) == 0 - - def test_get_cached_pages_mixed(self, site_store): - site_store.save_discovered_urls([{"url": "https://example.com/a"}, {"url": "https://example.com/b"}]) - site_store.update_content_hashes( - [ - { - "url": "https://example.com/a", - "content_hash": "h1", - "content": "Page A", - "word_count": 2, - }, - {"url": "https://example.com/b", "content_hash": "h2"}, - ] - ) - - cached = site_store.get_cached_pages( - ["https://example.com/a", "https://example.com/b", "https://example.com/c"] - ) - assert len(cached) == 1 - assert cached[0]["url"] == "https://example.com/a" - - def test_get_cached_pages_empty_input(self, site_store): - assert site_store.get_cached_pages([]) == [] - - def test_get_cached_pages_null_metadata(self, site_store): - site_store.save_discovered_urls([{"url": "https://example.com/a"}]) - 
site_store.update_content_hashes( - [ - { - "url": "https://example.com/a", - "content_hash": "h1", - "content": "Some content", - "word_count": 2, - }, - ] - ) - - cached = site_store.get_cached_pages(["https://example.com/a"]) - assert cached[0]["metadata"] is None - assert cached[0]["structured_data"] is None - - def test_close_and_reopen(self, tmp_path): - db_path = tmp_path / "test.db" - store = WebsiteStore(db_path) - store.save_discovered_urls([{"url": "https://example.com/persist"}]) - store.update_content_hashes([{"url": "https://example.com/persist", "content_hash": "h1"}]) - store.close() - - store2 = WebsiteStore(db_path) - assert store2.get_total_count() == 1 - pages = store2.get_urls_page() - assert pages[0]["url"] == "https://example.com/persist" - store2.close() - - def test_schema_migration_adds_fail_count(self, tmp_path): - db_path = tmp_path / "legacy_fc.db" - import sqlite3 - - conn = sqlite3.connect(str(db_path)) - conn.executescript(""" - CREATE TABLE website_urls ( - url TEXT PRIMARY KEY, - content_hash TEXT, - status TEXT NOT NULL DEFAULT 'discovered', - last_crawled_at REAL, - discovered_at REAL NOT NULL, - title TEXT, - content TEXT, - word_count INTEGER, - metadata TEXT, - structured_data TEXT - ); - """) - conn.execute( - "INSERT INTO website_urls (url, discovered_at) VALUES (?, ?)", - ("https://example.com/old", 1000.0), - ) - conn.commit() - conn.close() - - store = WebsiteStore(db_path) - # fail_count column should exist after migration - c = store._get_conn() - row = c.execute( - "SELECT fail_count FROM website_urls WHERE url = ?", - ("https://example.com/old",), - ).fetchone() - assert row["fail_count"] == 0 - store.close() - - def test_schema_migration_adds_columns(self, tmp_path): - db_path = tmp_path / "legacy.db" - import sqlite3 - - conn = sqlite3.connect(str(db_path)) - conn.executescript(""" - CREATE TABLE website_urls ( - url TEXT PRIMARY KEY, - content_hash TEXT, - status TEXT NOT NULL DEFAULT 'discovered', - 
last_crawled_at REAL, - discovered_at REAL NOT NULL - ); - """) - conn.execute( - "INSERT INTO website_urls (url, discovered_at) VALUES (?, ?)", - ("https://example.com/old", 1000.0), - ) - conn.commit() - conn.close() - - store = WebsiteStore(db_path) - store.update_content_hashes( - [ - { - "url": "https://example.com/old", - "content_hash": "h1", - "content": "Migrated content", - "title": "Old Page", - "word_count": 2, - }, - ] - ) - cached = store.get_cached_pages(["https://example.com/old"]) - assert len(cached) == 1 - assert cached[0]["content"] == "Migrated content" - assert cached[0]["title"] == "Old Page" - store.close() - - -class TestWebsiteStoreManager: - def test_register_website(self, manager): - result = manager.register_website("example.com", scan_interval=3600) - assert result["domain"] == "example.com" - assert result["scan_interval"] == 3600 - - website = manager.get_website("example.com") - assert website is not None - assert website["domain"] == "example.com" - assert website["scan_interval"] == 3600 - assert website["status"] == "idle" - - def test_register_website_upsert(self, manager): - manager.register_website("example.com", scan_interval=3600) - manager.register_website("example.com", scan_interval=7200) - - website = manager.get_website("example.com") - assert website["scan_interval"] == 7200 - - def test_remove_website(self, manager, tmp_data_dir): - manager.register_website("example.com") - site_store = manager.get_site_store("example.com") - site_store.save_discovered_urls([{"url": "https://example.com/a"}]) - - db_file = tmp_data_dir / "sites" / "example_com.db" - assert db_file.exists() - - removed = manager.remove_website("example.com") - assert removed is True - assert not db_file.exists() - assert manager.get_website("example.com") is None - - def test_remove_website_not_found(self, manager): - removed = manager.remove_website("nonexistent.com") - assert removed is False - - def test_get_due_websites_none_scanned(self, manager): 
- manager.register_website("a.com") - manager.register_website("b.com") - - due = manager.get_due_websites() - domains = [w["domain"] for w in due] - assert "a.com" in domains - assert "b.com" in domains - - def test_get_due_websites_excludes_scanning(self, manager): - manager.register_website("a.com") - manager.update_scan_status("a.com", "scanning") - - due = manager.get_due_websites() - assert len(due) == 0 - - def test_get_due_websites_excludes_recently_scanned(self, manager): - manager.register_website("a.com", scan_interval=3600) - manager.update_last_scanned("a.com") - - due = manager.get_due_websites() - assert len(due) == 0 - - def test_update_scan_status(self, manager): - manager.register_website("a.com") - manager.update_scan_status("a.com", "error", error="timeout") - - website = manager.get_website("a.com") - assert website["status"] == "error" - assert website["error"] == "timeout" - - def test_update_scan_status_clears_error(self, manager): - manager.register_website("a.com") - manager.update_scan_status("a.com", "error", error="timeout") - manager.update_scan_status("a.com", "idle") - - website = manager.get_website("a.com") - assert website["status"] == "idle" - assert website["error"] is None - - def test_get_site_store_creates_and_caches(self, manager): - store1 = manager.get_site_store("example.com") - store2 = manager.get_site_store("example.com") - assert store1 is store2 - - def test_get_site_store_different_domains(self, manager): - store_a = manager.get_site_store("a.com") - store_b = manager.get_site_store("b.com") - assert store_a is not store_b - - def test_site_store_isolation(self, manager): - store_a = manager.get_site_store("a.com") - store_b = manager.get_site_store("b.com") - - store_a.save_discovered_urls([{"url": "https://a.com/page"}]) - store_a.update_content_hashes([{"url": "https://a.com/page", "content_hash": "ha"}]) - - store_b.save_discovered_urls([{"url": "https://b.com/page1"}, {"url": "https://b.com/page2"}]) - 
store_b.update_content_hashes( - [ - {"url": "https://b.com/page1", "content_hash": "hb1"}, - {"url": "https://b.com/page2", "content_hash": "hb2"}, - ] - ) - - assert store_a.get_total_count() == 1 - assert store_b.get_total_count() == 2 - - def test_get_website_not_found(self, manager): - assert manager.get_website("nonexistent.com") is None - - def test_close_all(self, manager): - manager.register_website("a.com") - manager.get_site_store("a.com") - - manager.close_all() - # After close_all, internal state should be cleared - assert len(manager._stores) == 0 - assert manager._main_conn is None - - def test_removes_wal_and_shm_files(self, manager, tmp_data_dir): - manager.register_website("example.com") - site_store = manager.get_site_store("example.com") - site_store.save_discovered_urls([{"url": "https://example.com/a"}]) - - db_file = tmp_data_dir / "sites" / "example_com.db" - # WAL files may or may not exist depending on SQLite behavior, - # but remove_website should handle them gracefully - manager.remove_website("example.com") - assert not db_file.exists() - assert not db_file.with_suffix(".db-wal").exists() - assert not db_file.with_suffix(".db-shm").exists() - - def test_get_cached_pages_registered_domain(self, manager): - manager.register_website("example.com") - store = manager.get_site_store("example.com") - store.save_discovered_urls([{"url": "https://example.com/page"}]) - store.update_content_hashes( - [ - { - "url": "https://example.com/page", - "content_hash": "h1", - "content": "Cached content", - "title": "Cached", - "word_count": 2, - }, - ] - ) - - cached, to_crawl = manager.get_cached_pages(["https://example.com/page"]) - assert len(cached) == 1 - assert cached[0]["content"] == "Cached content" - assert len(to_crawl) == 0 - - def test_get_cached_pages_unregistered_domain(self, manager): - cached, to_crawl = manager.get_cached_pages(["https://unknown.com/page"]) - assert len(cached) == 0 - assert to_crawl == ["https://unknown.com/page"] - - 
def test_get_cached_pages_mixed_domains(self, manager): - manager.register_website("a.com") - store = manager.get_site_store("a.com") - store.save_discovered_urls([{"url": "https://a.com/page"}]) - store.update_content_hashes( - [ - { - "url": "https://a.com/page", - "content_hash": "h1", - "content": "Page A", - "word_count": 2, - }, - ] - ) - - cached, to_crawl = manager.get_cached_pages(["https://a.com/page", "https://b.com/other"]) - assert len(cached) == 1 - assert cached[0]["url"] == "https://a.com/page" - assert to_crawl == ["https://b.com/other"] - - def test_get_cached_pages_cache_miss_on_registered_domain(self, manager): - manager.register_website("example.com") - store = manager.get_site_store("example.com") - store.save_discovered_urls([{"url": "https://example.com/a"}]) - # Hash only, no content - store.update_content_hashes([{"url": "https://example.com/a", "content_hash": "h1"}]) - - cached, to_crawl = manager.get_cached_pages(["https://example.com/a"]) - assert len(cached) == 0 - assert to_crawl == ["https://example.com/a"] - - def test_get_cached_pages_empty_input(self, manager): - cached, to_crawl = manager.get_cached_pages([]) - assert cached == [] - assert to_crawl == [] diff --git a/services/crawler/tests/test_websites_router.py b/services/crawler/tests/test_websites_router.py new file mode 100644 index 0000000000..a46a0eb890 --- /dev/null +++ b/services/crawler/tests/test_websites_router.py @@ -0,0 +1,254 @@ +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from fastapi import FastAPI +from httpx import ASGITransport, AsyncClient + +from app.routers.websites import router + +app = FastAPI() +app.include_router(router) + + +@pytest.fixture +def mock_manager(): + manager = AsyncMock() + manager.get_site_store = MagicMock() + app.state.pg_store_manager = manager + yield manager + del app.state.pg_store_manager + + +class TestRegisterWebsite: + async def test_success(self, mock_manager): + 
mock_manager.register_website.return_value = { + "domain": "example.com", + "status": "idle", + "scan_interval": 21600, + } + + with patch("app.routers.websites.trigger_scan") as mock_trigger: + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/api/v1/websites", + json={"domain": "example.com", "scan_interval": 21600}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["domain"] == "example.com" + assert data["status"] == "idle" + assert data["scan_interval"] == 21600 + mock_manager.register_website.assert_awaited_once_with( + domain="example.com", + scan_interval=21600, + ) + mock_trigger.assert_called_once() + + async def test_uses_default_scan_interval(self, mock_manager): + mock_manager.register_website.return_value = { + "domain": "example.com", + "status": "idle", + "scan_interval": 21600, + } + + with patch("app.routers.websites.trigger_scan"): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/api/v1/websites", + json={"domain": "example.com"}, + ) + + assert response.status_code == 200 + mock_manager.register_website.assert_awaited_once_with( + domain="example.com", + scan_interval=21600, + ) + + async def test_500_on_error(self, mock_manager): + mock_manager.register_website.side_effect = RuntimeError("db error") + + with patch("app.routers.websites.trigger_scan"): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/api/v1/websites", + json={"domain": "example.com"}, + ) + + assert response.status_code == 500 + assert response.json()["detail"] == "Failed to register website" + + +class TestGetWebsiteInfo: + async def test_success(self, mock_manager): + mock_manager.get_website.return_value = { + "domain": "example.com", + "title": "Example", + "description": "An example site", + "page_count": 
42, + "status": "active", + "scan_interval": 3600, + "last_scanned_at": 1700000000.0, + "error": None, + "created_at": 1699000000.0, + "updated_at": 1700000000.0, + } + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/websites/example.com") + + assert response.status_code == 200 + data = response.json() + assert data["domain"] == "example.com" + assert data["title"] == "Example" + assert data["description"] == "An example site" + assert data["page_count"] == 42 + assert data["status"] == "active" + assert data["scan_interval"] == 3600 + assert data["last_scanned_at"] is not None + assert data["error"] is None + assert data["created_at"] is not None + assert data["updated_at"] is not None + mock_manager.get_website.assert_awaited_once_with("example.com") + + async def test_404_when_not_found(self, mock_manager): + mock_manager.get_website.return_value = None + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/websites/unknown.com") + + assert response.status_code == 404 + assert response.json()["detail"] == "Website not found: unknown.com" + + async def test_500_on_error(self, mock_manager): + mock_manager.get_website.side_effect = RuntimeError("db error") + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/websites/example.com") + + assert response.status_code == 500 + assert response.json()["detail"] == "Failed to get website info" + + +class TestDeregisterWebsite: + async def test_success(self, mock_manager): + mock_manager.remove_website.return_value = True + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.delete("/api/v1/websites/example.com") + + assert response.status_code == 200 + data = response.json() + assert data["domain"] == 
"example.com" + assert data["deleted"] is True + mock_manager.remove_website.assert_awaited_once_with("example.com") + + async def test_404_when_not_found(self, mock_manager): + mock_manager.remove_website.return_value = False + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.delete("/api/v1/websites/unknown.com") + + assert response.status_code == 404 + assert response.json()["detail"] == "Website not found: unknown.com" + + async def test_500_on_error(self, mock_manager): + mock_manager.remove_website.side_effect = RuntimeError("db error") + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.delete("/api/v1/websites/example.com") + + assert response.status_code == 500 + assert response.json()["detail"] == "Failed to deregister website" + + +class TestGetWebsiteUrls: + async def test_success_with_pagination(self, mock_manager): + mock_manager.get_website.return_value = {"domain": "example.com"} + mock_site_store = AsyncMock() + mock_manager.get_site_store.return_value = mock_site_store + mock_site_store.get_urls_page.return_value = [ + { + "url": "https://example.com/page1", + "content_hash": "abc123", + "status": "active", + "last_crawled_at": 1700000000.0, + }, + { + "url": "https://example.com/page2", + "content_hash": "def456", + "status": "active", + "last_crawled_at": 1700001000.0, + }, + ] + mock_site_store.get_total_count.return_value = 50 + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/websites/example.com/urls?offset=0&limit=2") + + assert response.status_code == 200 + data = response.json() + assert data["domain"] == "example.com" + assert len(data["urls"]) == 2 + assert data["urls"][0]["url"] == "https://example.com/page1" + assert data["urls"][0]["content_hash"] == "abc123" + assert data["urls"][1]["url"] == 
"https://example.com/page2" + assert data["total"] == 50 + assert data["offset"] == 0 + assert data["has_more"] is True + mock_site_store.get_urls_page.assert_awaited_once_with(offset=0, limit=2, status=None) + mock_site_store.get_total_count.assert_awaited_once_with(status=None) + + async def test_has_more_false_when_at_end(self, mock_manager): + mock_manager.get_website.return_value = {"domain": "example.com"} + mock_site_store = AsyncMock() + mock_manager.get_site_store.return_value = mock_site_store + mock_site_store.get_urls_page.return_value = [ + { + "url": "https://example.com/last", + "content_hash": "xyz", + "status": "active", + "last_crawled_at": None, + }, + ] + mock_site_store.get_total_count.return_value = 1 + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/websites/example.com/urls?offset=0&limit=100") + + assert response.status_code == 200 + data = response.json() + assert data["has_more"] is False + assert data["total"] == 1 + + async def test_status_filter(self, mock_manager): + mock_manager.get_website.return_value = {"domain": "example.com"} + mock_site_store = AsyncMock() + mock_manager.get_site_store.return_value = mock_site_store + mock_site_store.get_urls_page.return_value = [] + mock_site_store.get_total_count.return_value = 0 + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/websites/example.com/urls?status=active") + + assert response.status_code == 200 + mock_site_store.get_urls_page.assert_awaited_once_with(offset=0, limit=100, status="active") + mock_site_store.get_total_count.assert_awaited_once_with(status="active") + + async def test_404_when_website_not_found(self, mock_manager): + mock_manager.get_website.return_value = None + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await 
client.get("/api/v1/websites/unknown.com/urls") + + assert response.status_code == 404 + assert response.json()["detail"] == "Website not found: unknown.com" + + async def test_500_on_error(self, mock_manager): + mock_manager.get_website.side_effect = RuntimeError("db error") + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get("/api/v1/websites/example.com/urls") + + assert response.status_code == 500 + assert response.json()["detail"] == "Failed to get website URLs" diff --git a/services/crawler/uv.lock b/services/crawler/uv.lock index 322d8992e3..8a27971eaa 100644 --- a/services/crawler/uv.lock +++ b/services/crawler/uv.lock @@ -198,6 +198,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" }, ] +[[package]] +name = "asyncpg" +version = "0.31.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/cc/d18065ce2380d80b1bcce927c24a2642efd38918e33fd724bc4bca904877/asyncpg-0.31.0.tar.gz", hash = "sha256:c989386c83940bfbd787180f2b1519415e2d3d6277a70d9d0f0145ac73500735", size = 993667, upload-time = "2025-11-24T23:27:00.812Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/17/cc02bc49bc350623d050fa139e34ea512cd6e020562f2a7312a7bcae4bc9/asyncpg-0.31.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:eee690960e8ab85063ba93af2ce128c0f52fd655fdff9fdb1a28df01329f031d", size = 643159, upload-time = "2025-11-24T23:25:36.443Z" }, + { url = "https://files.pythonhosted.org/packages/a4/62/4ded7d400a7b651adf06f49ea8f73100cca07c6df012119594d1e3447aa6/asyncpg-0.31.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2657204552b75f8288de08ca60faf4a99a65deef3a71d1467454123205a88fab", 
size = 638157, upload-time = "2025-11-24T23:25:37.89Z" }, + { url = "https://files.pythonhosted.org/packages/d6/5b/4179538a9a72166a0bf60ad783b1ef16efb7960e4d7b9afe9f77a5551680/asyncpg-0.31.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a429e842a3a4b4ea240ea52d7fe3f82d5149853249306f7ff166cb9948faa46c", size = 2918051, upload-time = "2025-11-24T23:25:39.461Z" }, + { url = "https://files.pythonhosted.org/packages/e6/35/c27719ae0536c5b6e61e4701391ffe435ef59539e9360959240d6e47c8c8/asyncpg-0.31.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0807be46c32c963ae40d329b3a686356e417f674c976c07fa49f1b30303f109", size = 2972640, upload-time = "2025-11-24T23:25:41.512Z" }, + { url = "https://files.pythonhosted.org/packages/43/f4/01ebb9207f29e645a64699b9ce0eefeff8e7a33494e1d29bb53736f7766b/asyncpg-0.31.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e5d5098f63beeae93512ee513d4c0c53dc12e9aa2b7a1af5a81cddf93fe4e4da", size = 2851050, upload-time = "2025-11-24T23:25:43.153Z" }, + { url = "https://files.pythonhosted.org/packages/3e/f4/03ff1426acc87be0f4e8d40fa2bff5c3952bef0080062af9efc2212e3be8/asyncpg-0.31.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37fc6c00a814e18eef51833545d1891cac9aa69140598bb076b4cd29b3e010b9", size = 2962574, upload-time = "2025-11-24T23:25:44.942Z" }, + { url = "https://files.pythonhosted.org/packages/c7/39/cc788dfca3d4060f9d93e67be396ceec458dfc429e26139059e58c2c244d/asyncpg-0.31.0-cp311-cp311-win32.whl", hash = "sha256:5a4af56edf82a701aece93190cc4e094d2df7d33f6e915c222fb09efbb5afc24", size = 521076, upload-time = "2025-11-24T23:25:46.486Z" }, + { url = "https://files.pythonhosted.org/packages/28/fc/735af5384c029eb7f1ca60ccb8fa95521dbdaeef788edf4cecfc604c3cab/asyncpg-0.31.0-cp311-cp311-win_amd64.whl", hash = "sha256:480c4befbdf079c14c9ca43c8c5e1fe8b6296c96f1f927158d4f1e750aacc047", size = 584980, upload-time = 
"2025-11-24T23:25:47.938Z" }, + { url = "https://files.pythonhosted.org/packages/2a/a6/59d0a146e61d20e18db7396583242e32e0f120693b67a8de43f1557033e2/asyncpg-0.31.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b44c31e1efc1c15188ef183f287c728e2046abb1d26af4d20858215d50d91fad", size = 662042, upload-time = "2025-11-24T23:25:49.578Z" }, + { url = "https://files.pythonhosted.org/packages/36/01/ffaa189dcb63a2471720615e60185c3f6327716fdc0fc04334436fbb7c65/asyncpg-0.31.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0c89ccf741c067614c9b5fc7f1fc6f3b61ab05ae4aaa966e6fd6b93097c7d20d", size = 638504, upload-time = "2025-11-24T23:25:51.501Z" }, + { url = "https://files.pythonhosted.org/packages/9f/62/3f699ba45d8bd24c5d65392190d19656d74ff0185f42e19d0bbd973bb371/asyncpg-0.31.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:12b3b2e39dc5470abd5e98c8d3373e4b1d1234d9fbdedf538798b2c13c64460a", size = 3426241, upload-time = "2025-11-24T23:25:53.278Z" }, + { url = "https://files.pythonhosted.org/packages/8c/d1/a867c2150f9c6e7af6462637f613ba67f78a314b00db220cd26ff559d532/asyncpg-0.31.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:aad7a33913fb8bcb5454313377cc330fbb19a0cd5faa7272407d8a0c4257b671", size = 3520321, upload-time = "2025-11-24T23:25:54.982Z" }, + { url = "https://files.pythonhosted.org/packages/7a/1a/cce4c3f246805ecd285a3591222a2611141f1669d002163abef999b60f98/asyncpg-0.31.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3df118d94f46d85b2e434fd62c84cb66d5834d5a890725fe625f498e72e4d5ec", size = 3316685, upload-time = "2025-11-24T23:25:57.43Z" }, + { url = "https://files.pythonhosted.org/packages/40/ae/0fc961179e78cc579e138fad6eb580448ecae64908f95b8cb8ee2f241f67/asyncpg-0.31.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bd5b6efff3c17c3202d4b37189969acf8927438a238c6257f66be3c426beba20", size = 3471858, upload-time = "2025-11-24T23:25:59.636Z" }, + { url = 
"https://files.pythonhosted.org/packages/52/b2/b20e09670be031afa4cbfabd645caece7f85ec62d69c312239de568e058e/asyncpg-0.31.0-cp312-cp312-win32.whl", hash = "sha256:027eaa61361ec735926566f995d959ade4796f6a49d3bde17e5134b9964f9ba8", size = 527852, upload-time = "2025-11-24T23:26:01.084Z" }, + { url = "https://files.pythonhosted.org/packages/b5/f0/f2ed1de154e15b107dc692262395b3c17fc34eafe2a78fc2115931561730/asyncpg-0.31.0-cp312-cp312-win_amd64.whl", hash = "sha256:72d6bdcbc93d608a1158f17932de2321f68b1a967a13e014998db87a72ed3186", size = 597175, upload-time = "2025-11-24T23:26:02.564Z" }, + { url = "https://files.pythonhosted.org/packages/95/11/97b5c2af72a5d0b9bc3fa30cd4b9ce22284a9a943a150fdc768763caf035/asyncpg-0.31.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c204fab1b91e08b0f47e90a75d1b3c62174dab21f670ad6c5d0f243a228f015b", size = 661111, upload-time = "2025-11-24T23:26:04.467Z" }, + { url = "https://files.pythonhosted.org/packages/1b/71/157d611c791a5e2d0423f09f027bd499935f0906e0c2a416ce712ba51ef3/asyncpg-0.31.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:54a64f91839ba59008eccf7aad2e93d6e3de688d796f35803235ea1c4898ae1e", size = 636928, upload-time = "2025-11-24T23:26:05.944Z" }, + { url = "https://files.pythonhosted.org/packages/2e/fc/9e3486fb2bbe69d4a867c0b76d68542650a7ff1574ca40e84c3111bb0c6e/asyncpg-0.31.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c0e0822b1038dc7253b337b0f3f676cadc4ac31b126c5d42691c39691962e403", size = 3424067, upload-time = "2025-11-24T23:26:07.957Z" }, + { url = "https://files.pythonhosted.org/packages/12/c6/8c9d076f73f07f995013c791e018a1cd5f31823c2a3187fc8581706aa00f/asyncpg-0.31.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bef056aa502ee34204c161c72ca1f3c274917596877f825968368b2c33f585f4", size = 3518156, upload-time = "2025-11-24T23:26:09.591Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/3b/60683a0baf50fbc546499cfb53132cb6835b92b529a05f6a81471ab60d0c/asyncpg-0.31.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0bfbcc5b7ffcd9b75ab1558f00db2ae07db9c80637ad1b2469c43df79d7a5ae2", size = 3319636, upload-time = "2025-11-24T23:26:11.168Z" }, + { url = "https://files.pythonhosted.org/packages/50/dc/8487df0f69bd398a61e1792b3cba0e47477f214eff085ba0efa7eac9ce87/asyncpg-0.31.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:22bc525ebbdc24d1261ecbf6f504998244d4e3be1721784b5f64664d61fbe602", size = 3472079, upload-time = "2025-11-24T23:26:13.164Z" }, + { url = "https://files.pythonhosted.org/packages/13/a1/c5bbeeb8531c05c89135cb8b28575ac2fac618bcb60119ee9696c3faf71c/asyncpg-0.31.0-cp313-cp313-win32.whl", hash = "sha256:f890de5e1e4f7e14023619399a471ce4b71f5418cd67a51853b9910fdfa73696", size = 527606, upload-time = "2025-11-24T23:26:14.78Z" }, + { url = "https://files.pythonhosted.org/packages/91/66/b25ccb84a246b470eb943b0107c07edcae51804912b824054b3413995a10/asyncpg-0.31.0-cp313-cp313-win_amd64.whl", hash = "sha256:dc5f2fa9916f292e5c5c8b2ac2813763bcd7f58e130055b4ad8a0531314201ab", size = 596569, upload-time = "2025-11-24T23:26:16.189Z" }, + { url = "https://files.pythonhosted.org/packages/3c/36/e9450d62e84a13aea6580c83a47a437f26c7ca6fa0f0fd40b6670793ea30/asyncpg-0.31.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:f6b56b91bb0ffc328c4e3ed113136cddd9deefdf5f79ab448598b9772831df44", size = 660867, upload-time = "2025-11-24T23:26:17.631Z" }, + { url = "https://files.pythonhosted.org/packages/82/4b/1d0a2b33b3102d210439338e1beea616a6122267c0df459ff0265cd5807a/asyncpg-0.31.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:334dec28cf20d7f5bb9e45b39546ddf247f8042a690bff9b9573d00086e69cb5", size = 638349, upload-time = "2025-11-24T23:26:19.689Z" }, + { url = 
"https://files.pythonhosted.org/packages/41/aa/e7f7ac9a7974f08eff9183e392b2d62516f90412686532d27e196c0f0eeb/asyncpg-0.31.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:98cc158c53f46de7bb677fd20c417e264fc02b36d901cc2a43bd6cb0dc6dbfd2", size = 3410428, upload-time = "2025-11-24T23:26:21.275Z" }, + { url = "https://files.pythonhosted.org/packages/6f/de/bf1b60de3dede5c2731e6788617a512bc0ebd9693eac297ee74086f101d7/asyncpg-0.31.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9322b563e2661a52e3cdbc93eed3be7748b289f792e0011cb2720d278b366ce2", size = 3471678, upload-time = "2025-11-24T23:26:23.627Z" }, + { url = "https://files.pythonhosted.org/packages/46/78/fc3ade003e22d8bd53aaf8f75f4be48f0b460fa73738f0391b9c856a9147/asyncpg-0.31.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:19857a358fc811d82227449b7ca40afb46e75b33eb8897240c3839dd8b744218", size = 3313505, upload-time = "2025-11-24T23:26:25.235Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e9/73eb8a6789e927816f4705291be21f2225687bfa97321e40cd23055e903a/asyncpg-0.31.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ba5f8886e850882ff2c2ace5732300e99193823e8107e2c53ef01c1ebfa1e85d", size = 3434744, upload-time = "2025-11-24T23:26:26.944Z" }, + { url = "https://files.pythonhosted.org/packages/08/4b/f10b880534413c65c5b5862f79b8e81553a8f364e5238832ad4c0af71b7f/asyncpg-0.31.0-cp314-cp314-win32.whl", hash = "sha256:cea3a0b2a14f95834cee29432e4ddc399b95700eb1d51bbc5bfee8f31fa07b2b", size = 532251, upload-time = "2025-11-24T23:26:28.404Z" }, + { url = "https://files.pythonhosted.org/packages/d3/2d/7aa40750b7a19efa5d66e67fc06008ca0f27ba1bd082e457ad82f59aba49/asyncpg-0.31.0-cp314-cp314-win_amd64.whl", hash = "sha256:04d19392716af6b029411a0264d92093b6e5e8285ae97a39957b9a9c14ea72be", size = 604901, upload-time = "2025-11-24T23:26:30.34Z" }, + { url = 
"https://files.pythonhosted.org/packages/ce/fe/b9dfe349b83b9dee28cc42360d2c86b2cdce4cb551a2c2d27e156bcac84d/asyncpg-0.31.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:bdb957706da132e982cc6856bb2f7b740603472b54c3ebc77fe60ea3e57e1bd2", size = 702280, upload-time = "2025-11-24T23:26:32Z" }, + { url = "https://files.pythonhosted.org/packages/6a/81/e6be6e37e560bd91e6c23ea8a6138a04fd057b08cf63d3c5055c98e81c1d/asyncpg-0.31.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6d11b198111a72f47154fa03b85799f9be63701e068b43f84ac25da0bda9cb31", size = 682931, upload-time = "2025-11-24T23:26:33.572Z" }, + { url = "https://files.pythonhosted.org/packages/a6/45/6009040da85a1648dd5bc75b3b0a062081c483e75a1a29041ae63a0bf0dc/asyncpg-0.31.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:18c83b03bc0d1b23e6230f5bf8d4f217dc9bc08644ce0502a9d91dc9e634a9c7", size = 3581608, upload-time = "2025-11-24T23:26:35.638Z" }, + { url = "https://files.pythonhosted.org/packages/7e/06/2e3d4d7608b0b2b3adbee0d0bd6a2d29ca0fc4d8a78f8277df04e2d1fd7b/asyncpg-0.31.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e009abc333464ff18b8f6fd146addffd9aaf63e79aa3bb40ab7a4c332d0c5e9e", size = 3498738, upload-time = "2025-11-24T23:26:37.275Z" }, + { url = "https://files.pythonhosted.org/packages/7d/aa/7d75ede780033141c51d83577ea23236ba7d3a23593929b32b49db8ed36e/asyncpg-0.31.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3b1fbcb0e396a5ca435a8826a87e5c2c2cc0c8c68eb6fadf82168056b0e53a8c", size = 3401026, upload-time = "2025-11-24T23:26:39.423Z" }, + { url = "https://files.pythonhosted.org/packages/ba/7a/15e37d45e7f7c94facc1e9148c0e455e8f33c08f0b8a0b1deb2c5171771b/asyncpg-0.31.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8df714dba348efcc162d2adf02d213e5fab1bd9f557e1305633e851a61814a7a", size = 3429426, upload-time = "2025-11-24T23:26:41.032Z" }, + { url = 
"https://files.pythonhosted.org/packages/13/d5/71437c5f6ae5f307828710efbe62163974e71237d5d46ebd2869ea052d10/asyncpg-0.31.0-cp314-cp314t-win32.whl", hash = "sha256:1b41f1afb1033f2b44f3234993b15096ddc9cd71b21a42dbd87fc6a57b43d65d", size = 614495, upload-time = "2025-11-24T23:26:42.659Z" }, + { url = "https://files.pythonhosted.org/packages/3c/d7/8fb3044eaef08a310acfe23dae9a8e2e07d305edc29a53497e52bc76eca7/asyncpg-0.31.0-cp314-cp314t-win_amd64.whl", hash = "sha256:bd4107bb7cdd0e9e65fae66a62afd3a249663b844fa34d479f6d5b3bef9c04c3", size = 706062, upload-time = "2025-11-24T23:26:44.086Z" }, +] + [[package]] name = "attrs" version = "25.4.0" @@ -2858,6 +2906,7 @@ name = "tale-crawler" version = "0.1.0" source = { virtual = "." } dependencies = [ + { name = "asyncpg" }, { name = "beautifulsoup4" }, { name = "crawl4ai" }, { name = "fastapi" }, @@ -2872,6 +2921,7 @@ dependencies = [ { name = "python-dotenv" }, { name = "python-multipart" }, { name = "python-pptx" }, + { name = "tiktoken" }, { name = "uvicorn", extra = ["standard"] }, ] @@ -2886,6 +2936,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "asyncpg", specifier = ">=0.30.0" }, { name = "beautifulsoup4", specifier = "==4.14.3" }, { name = "crawl4ai", specifier = "==0.8.0" }, { name = "fastapi", specifier = "==0.133.1" }, @@ -2905,6 +2956,7 @@ requires-dist = [ { name = "python-multipart", specifier = "==0.0.20" }, { name = "python-pptx", specifier = "==1.0.2" }, { name = "ruff", marker = "extra == 'dev'", specifier = "==0.15.4" }, + { name = "tiktoken", specifier = ">=0.9.0" }, { name = "uvicorn", extras = ["standard"], specifier = "==0.41.0" }, ] provides-extras = ["dev"] diff --git a/services/db/Dockerfile b/services/db/Dockerfile index 0b4dc40a41..addbbb75a1 100644 --- a/services/db/Dockerfile +++ b/services/db/Dockerfile @@ -1,12 +1,13 @@ -# Dockerfile for Tale DB (TimescaleDB) +# Dockerfile for Tale DB (ParadeDB) # Supports AMD64 and ARM64 architectures +# +# Base: ParadeDB (pg_search BM25 + 
pgvector) # Version argument - injected by CI from git tag, defaults to 'dev' for local builds ARG VERSION=dev -# Use official TimescaleDB image as base -# TimescaleDB is PostgreSQL with time-series extensions -FROM timescale/timescaledb:2.25.1-pg16 +# ParadeDB includes pg_search (BM25), pgvector, and PostgreSQL 16 +FROM paradedb/paradedb:v0.21.9-pg16 # Re-declare VERSION arg (ARGs don't persist after FROM) ARG VERSION=dev @@ -14,17 +15,22 @@ ARG VERSION=dev # Switch to root for all setup operations USER root -# Install additional tools, create directories, and set up configuration in one layer -# Note: TimescaleDB image is based on Alpine Linux, so we use apk instead of apt-get -RUN apk add --no-cache \ - curl \ - ca-certificates \ +# Install additional tools and create required directories +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + curl \ + ca-certificates \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* \ && mkdir -p /docker-entrypoint-initdb.d \ /etc/postgresql/conf.d \ /var/lib/postgresql/backup # Copy initialization scripts +# - /docker-entrypoint-initdb.d/ : PostgreSQL runs these on first init only +# - /etc/postgresql/init-scripts/ : Entrypoint wrapper runs these on every startup COPY services/db/init-scripts/ /docker-entrypoint-initdb.d/ +COPY services/db/init-scripts/ /etc/postgresql/init-scripts/ # Copy custom PostgreSQL configuration to the expected location COPY services/db/postgresql.conf /etc/postgresql/postgresql.conf @@ -36,8 +42,9 @@ RUN chmod +x /usr/local/bin/docker-entrypoint-wrapper.sh \ /etc/postgresql \ /var/lib/postgresql/backup -# Switch to postgres user for runtime security -USER postgres +# Run as root — the standard docker-entrypoint.sh detects root, +# fixes data directory ownership, then re-execs as postgres via gosu. +# This handles UID mismatches when switching base images (e.g., Alpine→Debian). 
# Set environment variables with DB_ prefix defaults # These can be overridden at runtime @@ -53,11 +60,8 @@ ENV TALE_VERSION=${VERSION} \ DB_MAX_CONNECTIONS=100 \ DB_SHARED_BUFFERS=256MB \ DB_EFFECTIVE_CACHE_SIZE=1GB \ - DB_MAINTENANCE_WORK_MEM=64MB \ - DB_WORK_MEM=4MB \ - # TimescaleDB specific - DB_TIMESCALEDB_TELEMETRY=off \ - TIMESCALEDB_TELEMETRY=off \ + DB_MAINTENANCE_WORK_MEM=128MB \ + DB_WORK_MEM=32MB \ # Logging DB_LOG_STATEMENT=none \ DB_LOG_MIN_DURATION_STATEMENT=-1 diff --git a/services/db/docker-entrypoint-wrapper.sh b/services/db/docker-entrypoint-wrapper.sh index 9b636ba90c..806f6a5102 100644 --- a/services/db/docker-entrypoint-wrapper.sh +++ b/services/db/docker-entrypoint-wrapper.sh @@ -3,7 +3,7 @@ set -e # Tale DB Entrypoint Wrapper # This script maps DB_ prefixed environment variables to PostgreSQL configuration -# and then calls the original TimescaleDB entrypoint +# and then calls the original PostgreSQL entrypoint # ============================================================================ # Map DB_ environment variables to PostgreSQL standard variables @@ -42,12 +42,6 @@ if [ -n "$DB_WORK_MEM" ]; then POSTGRES_ARGS+=("-c" "work_mem=${DB_WORK_MEM}") fi -# TimescaleDB settings -if [ -n "$DB_TIMESCALEDB_TELEMETRY" ]; then - POSTGRES_ARGS+=("-c" "timescaledb.telemetry_level=${DB_TIMESCALEDB_TELEMETRY}") - export TIMESCALEDB_TELEMETRY="${DB_TIMESCALEDB_TELEMETRY}" -fi - # Logging settings if [ -n "$DB_LOG_STATEMENT" ]; then POSTGRES_ARGS+=("-c" "log_statement=${DB_LOG_STATEMENT}") @@ -73,11 +67,37 @@ echo "User: ${POSTGRES_USER}" echo "Max Connections: ${DB_MAX_CONNECTIONS:-100}" echo "Shared Buffers: ${DB_SHARED_BUFFERS:-256MB}" echo "Effective Cache Size: ${DB_EFFECTIVE_CACHE_SIZE:-1GB}" -echo "TimescaleDB Telemetry: ${DB_TIMESCALEDB_TELEMETRY:-off}" echo "==================================================" # ============================================================================ -# Call the original TimescaleDB/PostgreSQL entrypoint +# 
Post-start init scripts (idempotent, run on every startup) +# ============================================================================ +# All init scripts use IF NOT EXISTS / CREATE OR REPLACE / DROP IF EXISTS +# so they are safe to re-run. This ensures schema, extensions, and indexes +# converge to the desired state on every container start — not just first init. + +INIT_SCRIPTS_DIR="/etc/postgresql/init-scripts" + +run_init_scripts() { + echo "Running init scripts..." + for script in "$INIT_SCRIPTS_DIR"/*.sql; do + [ -f "$script" ] || continue + echo " $(basename "$script")" + psql -U "$POSTGRES_USER" -d "$POSTGRES_DB" -f "$script" 2>&1 | grep -E "^(ERROR|NOTICE)" || true + done + echo "Init scripts complete." +} + +# Run init scripts in the background after PostgreSQL starts +( + until pg_isready -U "$POSTGRES_USER" -q 2>/dev/null; do + sleep 1 + done + run_init_scripts +) & + +# ============================================================================ +# Call the original PostgreSQL entrypoint # ============================================================================ exec docker-entrypoint.sh "$@" "${POSTGRES_ARGS[@]}" diff --git a/services/db/init-scripts/01-init-extensions.sql b/services/db/init-scripts/01-init-extensions.sql new file mode 100644 index 0000000000..064e9eba18 --- /dev/null +++ b/services/db/init-scripts/01-init-extensions.sql @@ -0,0 +1,24 @@ +-- Tale DB: Core extensions and schema setup +-- Idempotent: safe to run on every startup + +-- Remove legacy TimescaleDB extension if present +DROP EXTENSION IF EXISTS timescaledb CASCADE; + +-- Enable core extensions +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; +CREATE EXTENSION IF NOT EXISTS "pg_stat_statements"; +CREATE EXTENSION IF NOT EXISTS "pgcrypto"; + +-- Create tale schema +CREATE SCHEMA IF NOT EXISTS tale; + +-- Set search path +DO $$ +BEGIN + EXECUTE format('ALTER DATABASE %I SET search_path TO tale, public', current_database()); +END $$; + +-- Grant permissions +GRANT ALL 
PRIVILEGES ON SCHEMA tale TO CURRENT_USER; +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA tale TO CURRENT_USER; +GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA tale TO CURRENT_USER; diff --git a/services/db/init-scripts/01-init-timescaledb.sql b/services/db/init-scripts/01-init-timescaledb.sql deleted file mode 100644 index 42ac7404c2..0000000000 --- a/services/db/init-scripts/01-init-timescaledb.sql +++ /dev/null @@ -1,85 +0,0 @@ --- Tale DB Initialization Script --- This script sets up the TimescaleDB extension and creates initial schema - --- Enable TimescaleDB extension -CREATE EXTENSION IF NOT EXISTS timescaledb; - --- Enable additional useful extensions -CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; -- UUID generation -CREATE EXTENSION IF NOT EXISTS "pg_stat_statements"; -- Query performance monitoring -CREATE EXTENSION IF NOT EXISTS "pgcrypto"; -- Cryptographic functions - --- Create schema for Tale application -CREATE SCHEMA IF NOT EXISTS tale; - --- Set search path to include tale schema for the current database -DO $$ -BEGIN - EXECUTE format('ALTER DATABASE %I SET search_path TO tale, public', current_database()); -END $$; - --- Grant permissions to the current user (entrypoint runs scripts as POSTGRES_USER) -GRANT ALL PRIVILEGES ON SCHEMA tale TO CURRENT_USER; -GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA tale TO CURRENT_USER; -GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA tale TO CURRENT_USER; - --- Create a sample time-series table (can be customized based on needs) -CREATE TABLE IF NOT EXISTS tale.metrics ( - time TIMESTAMPTZ NOT NULL, - metric_name TEXT NOT NULL, - value DOUBLE PRECISION, - tags JSONB, - metadata JSONB -); - --- Convert to hypertable for time-series optimization -SELECT create_hypertable('tale.metrics', 'time', if_not_exists => TRUE); - --- Create indexes for common queries -CREATE INDEX IF NOT EXISTS idx_metrics_name_time ON tale.metrics (metric_name, time DESC); -CREATE INDEX IF NOT EXISTS idx_metrics_tags ON tale.metrics USING GIN 
(tags); - --- Create a sample events table -CREATE TABLE IF NOT EXISTS tale.events ( - time TIMESTAMPTZ NOT NULL, - event_type TEXT NOT NULL, - user_id UUID, - session_id UUID, - properties JSONB, - metadata JSONB -); - --- Convert to hypertable -SELECT create_hypertable('tale.events', 'time', if_not_exists => TRUE); - --- Create indexes -CREATE INDEX IF NOT EXISTS idx_events_type_time ON tale.events (event_type, time DESC); -CREATE INDEX IF NOT EXISTS idx_events_user ON tale.events (user_id, time DESC); -CREATE INDEX IF NOT EXISTS idx_events_session ON tale.events (session_id, time DESC); -CREATE INDEX IF NOT EXISTS idx_events_properties ON tale.events USING GIN (properties); - --- Create retention policies (optional - adjust based on needs) --- Automatically drop data older than 90 days --- SELECT add_retention_policy('tale.metrics', INTERVAL '90 days', if_not_exists => TRUE); --- SELECT add_retention_policy('tale.events', INTERVAL '90 days', if_not_exists => TRUE); - --- Create continuous aggregates for common queries (optional) --- Example: hourly metrics rollup --- CREATE MATERIALIZED VIEW tale.metrics_hourly --- WITH (timescaledb.continuous) AS --- SELECT --- time_bucket('1 hour', time) AS bucket, --- metric_name, --- AVG(value) as avg_value, --- MAX(value) as max_value, --- MIN(value) as min_value, --- COUNT(*) as count --- FROM tale.metrics --- GROUP BY bucket, metric_name; - --- Log successful initialization -DO $$ -BEGIN - RAISE NOTICE 'Tale DB initialized successfully with TimescaleDB'; -END $$; - diff --git a/services/db/init-scripts/02-create-convex-database.sql b/services/db/init-scripts/02-create-convex-database.sql index 3ffe871958..2fcb8d0647 100644 --- a/services/db/init-scripts/02-create-convex-database.sql +++ b/services/db/init-scripts/02-create-convex-database.sql @@ -1,23 +1,14 @@ --- ============================================================================ --- Create Convex Self-Hosted Database --- 
============================================================================ --- This script creates the database required by Convex self-hosted backend. --- The database name is HARDCODED to tale_platform for safety and consistency. --- ============================================================================ +-- Tale DB: Convex self-hosted database +-- Idempotent: safe to run on every startup --- Create the Convex database (hardcoded name) -CREATE DATABASE tale_platform; +SELECT 'CREATE DATABASE tale_platform' +WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'tale_platform') +\gexec --- Grant privileges to the tale user GRANT ALL PRIVILEGES ON DATABASE tale_platform TO tale; --- Connect to the new database \c tale_platform --- Enable required extensions +DROP EXTENSION IF EXISTS timescaledb CASCADE; CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; CREATE EXTENSION IF NOT EXISTS "pg_trgm"; - --- Log completion -\echo 'Convex database created successfully: tale_platform' - diff --git a/services/db/init-scripts/03-create-rag-database.sql b/services/db/init-scripts/03-create-rag-database.sql index b5bc4a1e4a..b8664e84f9 100644 --- a/services/db/init-scripts/03-create-rag-database.sql +++ b/services/db/init-scripts/03-create-rag-database.sql @@ -1,34 +1,19 @@ --- ============================================================================ --- Create RAG (Cognee) Database --- ============================================================================ --- This script creates the database required by the RAG service (Cognee). --- The database is dedicated to RAG to allow safe full-database resets without --- affecting other services (e.g., Convex uses tale_platform). 
--- ============================================================================ +-- Tale DB: RAG (Cognee) database +-- Idempotent: safe to run on every startup --- Create the RAG database (hardcoded name for safety) -CREATE DATABASE tale_rag; +SELECT 'CREATE DATABASE tale_rag' +WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'tale_rag') +\gexec --- Grant privileges to the tale user GRANT ALL PRIVILEGES ON DATABASE tale_rag TO tale; --- Connect to the new database \c tale_rag --- Enable required extensions for Cognee/PGVector +DROP EXTENSION IF EXISTS timescaledb CASCADE; CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; CREATE EXTENSION IF NOT EXISTS "vector"; --- ============================================================================ --- HNSW Index Management for PGVector --- ============================================================================ --- Cognee creates vector tables dynamically (one per collection/dataset). --- This function creates HNSW indexes on vector columns for fast similarity search. --- Without indexes, queries on 200k+ vectors can take 5-15 seconds. --- With HNSW indexes, queries complete in <500ms. 
--- ============================================================================ - --- Function to create HNSW indexes on all vector columns that don't have one +-- Dynamic HNSW index creation for Cognee's vector tables CREATE OR REPLACE FUNCTION create_vector_hnsw_indexes() RETURNS void AS $$ DECLARE @@ -36,50 +21,29 @@ DECLARE index_name TEXT; index_exists BOOLEAN; BEGIN - -- Find all columns with vector type in public schema FOR rec IN - SELECT - c.table_name, - c.column_name, - c.udt_name + SELECT c.table_name, c.column_name FROM information_schema.columns c JOIN information_schema.tables t - ON c.table_name = t.table_name - AND c.table_schema = t.table_schema + ON c.table_name = t.table_name AND c.table_schema = t.table_schema WHERE c.table_schema = 'public' AND c.udt_name = 'vector' AND t.table_type = 'BASE TABLE' LOOP - -- Generate index name index_name := rec.table_name || '_' || rec.column_name || '_hnsw_idx'; - -- Check if index already exists SELECT EXISTS ( SELECT 1 FROM pg_indexes - WHERE schemaname = 'public' - AND indexname = index_name + WHERE schemaname = 'public' AND indexname = index_name ) INTO index_exists; - -- Create index if it doesn't exist IF NOT index_exists THEN - RAISE NOTICE 'Creating HNSW index: % on %.%', - index_name, rec.table_name, rec.column_name; - - -- Use cosine distance operator (most common for embeddings) - -- m=16, ef_construction=64 are good defaults for quality/speed balance EXECUTE format( 'CREATE INDEX %I ON %I USING hnsw (%I vector_cosine_ops) WITH (m = 16, ef_construction = 64)', - index_name, - rec.table_name, - rec.column_name + index_name, rec.table_name, rec.column_name ); - RAISE NOTICE 'Created HNSW index: %', index_name; END IF; END LOOP; END; $$ LANGUAGE plpgsql; - --- Log completion -\echo 'RAG database created successfully: tale_rag' -\echo 'HNSW index function created: SELECT create_vector_hnsw_indexes();' diff --git a/services/db/init-scripts/04-create-search-database.sql 
b/services/db/init-scripts/04-create-search-database.sql new file mode 100644 index 0000000000..89776afaff --- /dev/null +++ b/services/db/init-scripts/04-create-search-database.sql @@ -0,0 +1,111 @@ +-- Tale DB: Crawler search database (pgvector + pg_search BM25) +-- Idempotent: safe to run on every startup + +SELECT 'CREATE DATABASE tale_crawler_search' +WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'tale_crawler_search') +\gexec + +GRANT ALL PRIVILEGES ON DATABASE tale_crawler_search TO tale; + +\c tale_crawler_search + +DROP EXTENSION IF EXISTS timescaledb CASCADE; +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; +CREATE EXTENSION IF NOT EXISTS "vector"; +CREATE EXTENSION IF NOT EXISTS "pg_search"; + +-- ============================================================================ +-- Websites +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS websites ( + domain TEXT PRIMARY KEY, + title TEXT, + description TEXT, + page_count INTEGER NOT NULL DEFAULT 0, + status TEXT NOT NULL DEFAULT 'idle', + scan_interval INTEGER NOT NULL DEFAULT 21600, + last_scanned_at TIMESTAMPTZ, + error TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_websites_status ON websites(status); +CREATE INDEX IF NOT EXISTS idx_websites_due ON websites(status, last_scanned_at); + +-- ============================================================================ +-- Website URLs +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS website_urls ( + id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + domain TEXT NOT NULL REFERENCES websites(domain) ON DELETE CASCADE, + url TEXT NOT NULL, + content_hash TEXT, + status TEXT NOT NULL DEFAULT 'discovered', + last_crawled_at TIMESTAMPTZ, + discovered_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + title TEXT, + content TEXT, + word_count INTEGER, 
+ metadata JSONB, + structured_data JSONB, + fail_count INTEGER NOT NULL DEFAULT 0, + etag TEXT, + last_modified TEXT, + UNIQUE(domain, url) +); + +CREATE INDEX IF NOT EXISTS idx_website_urls_domain ON website_urls(domain); +CREATE INDEX IF NOT EXISTS idx_website_urls_domain_status ON website_urls(domain, status); +CREATE INDEX IF NOT EXISTS idx_website_urls_crawl_order ON website_urls(domain, last_crawled_at NULLS FIRST); + +-- ============================================================================ +-- Chunks (search index) +-- ============================================================================ + +CREATE TABLE IF NOT EXISTS chunks ( + id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + domain TEXT NOT NULL, + url TEXT NOT NULL, + title TEXT, + content_hash TEXT NOT NULL, + chunk_index INTEGER NOT NULL, + chunk_content TEXT NOT NULL, + embedding vector, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + UNIQUE(url, chunk_index), + FOREIGN KEY (domain, url) REFERENCES website_urls(domain, url) ON DELETE CASCADE +); + +CREATE INDEX IF NOT EXISTS idx_chunks_domain ON chunks(domain); +CREATE INDEX IF NOT EXISTS idx_chunks_url ON chunks(url); +CREATE INDEX IF NOT EXISTS idx_chunks_url_content_hash ON chunks(url, content_hash); + +-- BM25 full-text index (ParadeDB pg_search) +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_indexes WHERE indexname = 'idx_chunks_bm25') THEN + CREATE INDEX idx_chunks_bm25 ON chunks + USING bm25 (id, chunk_content) + WITH (key_field='id'); + END IF; +EXCEPTION WHEN OTHERS THEN + RAISE NOTICE 'BM25 index deferred: %', SQLERRM; +END; +$$; + +-- Dynamic HNSW index (vector dimensions are configurable) +CREATE OR REPLACE FUNCTION create_chunks_hnsw_index() +RETURNS void AS $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_indexes + WHERE tablename = 'chunks' AND indexname = 'idx_chunks_embedding_hnsw' + ) THEN + EXECUTE 'CREATE INDEX idx_chunks_embedding_hnsw ON chunks USING hnsw (embedding vector_cosine_ops) WITH (m = 16, 
ef_construction = 64)'; + RAISE NOTICE 'Created HNSW index on chunks.embedding'; + END IF; +END; +$$ LANGUAGE plpgsql; diff --git a/services/db/postgresql.conf b/services/db/postgresql.conf index 1a9e36934a..fdff07ab33 100644 --- a/services/db/postgresql.conf +++ b/services/db/postgresql.conf @@ -1,5 +1,5 @@ # Tale DB PostgreSQL Configuration -# Custom configuration for TimescaleDB optimized for Tale platform +# Custom configuration for ParadeDB (pg_search + pgvector) optimized for Tale platform # This file is loaded in addition to the default PostgreSQL configuration # ============================================================================ @@ -18,20 +18,20 @@ listen_addresses = '*' # Shared memory for caching data # Controlled by DB_SHARED_BUFFERS environment variable # Recommended: 25% of system RAM for dedicated DB server -# shared_buffers = 256MB +shared_buffers = 256MB # Memory for maintenance operations (VACUUM, CREATE INDEX, etc.) # Controlled by DB_MAINTENANCE_WORK_MEM environment variable -# maintenance_work_mem = 64MB +maintenance_work_mem = 128MB # Memory for query operations (sorts, hash tables) # Controlled by DB_WORK_MEM environment variable -# work_mem = 4MB +work_mem = 32MB # Estimate of memory available for disk caching # Controlled by DB_EFFECTIVE_CACHE_SIZE environment variable # Recommended: 50-75% of system RAM -# effective_cache_size = 1GB +effective_cache_size = 1GB # ============================================================================ # Write-Ahead Log (WAL) Settings @@ -82,21 +82,11 @@ log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h ' log_connections = on log_disconnections = on -# ============================================================================ -# TimescaleDB Settings -# ============================================================================ -# Disable telemetry -# Controlled by DB_TIMESCALEDB_TELEMETRY environment variable -# timescaledb.telemetry_level = off - -# TimescaleDB background 
workers -timescaledb.max_background_workers = 8 - # ============================================================================ # Statistics # ============================================================================ # Enable query statistics collection -shared_preload_libraries = 'timescaledb,pg_stat_statements' +shared_preload_libraries = 'pg_search,pg_stat_statements' # Track query statistics pg_stat_statements.track = all diff --git a/services/platform/app/features/automations/utils/step-icons.tsx b/services/platform/app/features/automations/utils/step-icons.tsx index 0f21e28e13..69cda0215f 100644 --- a/services/platform/app/features/automations/utils/step-icons.tsx +++ b/services/platform/app/features/automations/utils/step-icons.tsx @@ -21,7 +21,6 @@ import { CheckCircle, Cloud, Globe, - Layout, GitBranch, Settings, } from 'lucide-react'; @@ -42,7 +41,6 @@ const ACTION_ICON_MAP: Record = { onedrive: Cloud, crawler: Globe, website: Globe, - websitePages: Layout, workflow: GitBranch, }; diff --git a/services/platform/app/features/websites/components/website-pages-dialog.tsx b/services/platform/app/features/websites/components/website-pages-dialog.tsx index 408192ebb3..187d5ae306 100644 --- a/services/platform/app/features/websites/components/website-pages-dialog.tsx +++ b/services/platform/app/features/websites/components/website-pages-dialog.tsx @@ -1,12 +1,10 @@ 'use client'; -import type { Components } from 'react-markdown'; - import { FileText } from 'lucide-react'; -import { useCallback, useRef, useState } from 'react'; -import ReactMarkdown from 'react-markdown'; +import { useCallback, useEffect, useState } from 'react'; import type { Id } from '@/convex/_generated/dataModel'; +import type { CrawlerPage } from '@/convex/websites/types'; import { ViewDialog } from '@/app/components/ui/dialog/view-dialog'; import { EmptyState } from '@/app/components/ui/feedback/empty-state'; @@ -15,19 +13,12 @@ import { BorderedSection } from 
'@/app/components/ui/layout/bordered-section'; import { Button } from '@/app/components/ui/primitives/button'; import { Heading } from '@/app/components/ui/typography/heading'; import { Text } from '@/app/components/ui/typography/text'; -import { - markdownComponents, - markdownWrapperStyles, -} from '@/app/features/chat/components/message-bubble/markdown-renderer'; +import { useConvexAction } from '@/app/hooks/use-convex-action'; +import { useFormatDate } from '@/app/hooks/use-format-date'; +import { api } from '@/convex/_generated/api'; import { useT } from '@/lib/i18n/client'; -import { cn } from '@/lib/utils/cn'; - -import { useListWebsitePagesPaginated } from '../hooks/queries'; - -// oxlint-disable-next-line typescript/no-unsafe-type-assertion -- markdownComponents are structurally compatible with react-markdown Components; index signature mismatch is a React type version conflict -const mdComponents = markdownComponents as unknown as Components; -const COLLAPSED_MAX_HEIGHT = 256; +const PAGE_SIZE = 20; interface WebsitePagesDialogProps { isOpen: boolean; @@ -42,50 +33,6 @@ function PageSkeleton() { - - - ); -} - -function CollapsibleMarkdown({ content }: { content: string }) { - const { t } = useT('websites'); - const [isExpanded, setIsExpanded] = useState(false); - const [isOverflowing, setIsOverflowing] = useState(false); - const contentRef = useRef(null); - - const measureRef = useCallback((node: HTMLDivElement | null) => { - if (node) { - contentRef.current = node; - setIsOverflowing(node.scrollHeight > COLLAPSED_MAX_HEIGHT); - } - }, []); - - return ( -
-
- {content} -
- {isOverflowing && ( - - )}
); } @@ -97,17 +44,45 @@ export function WebsitePagesDialog({ websiteDomain, }: WebsitePagesDialogProps) { const { t } = useT('websites'); - - const { results, status, loadMore, isLoading } = useListWebsitePagesPaginated( + const { formatDate } = useFormatDate(); + const [pages, setPages] = useState([]); + const [hasMore, setHasMore] = useState(false); + const [offset, setOffset] = useState(0); + const [isFirstLoad, setIsFirstLoad] = useState(true); + + const { mutate: fetchPages, isPending } = useConvexAction( + api.websites.actions.fetchPages, { - websiteId, - initialNumItems: 10, + onSuccess: (data) => { + if (data.offset === 0) { + setPages(data.pages); + } else { + setPages((prev) => [...prev, ...data.pages]); + } + setHasMore(data.hasMore); + setIsFirstLoad(false); + }, + onError: () => { + setIsFirstLoad(false); + }, }, ); - const isDone = status === 'Exhausted'; - const isLoadingMore = status === 'LoadingMore'; - const isLoadingFirst = status === 'LoadingFirstPage'; + useEffect(() => { + if (isOpen) { + setPages([]); + setOffset(0); + setHasMore(false); + setIsFirstLoad(true); + fetchPages({ websiteId, offset: 0, limit: PAGE_SIZE }); + } + }, [isOpen, websiteId, fetchPages]); + + const loadMore = useCallback(() => { + const nextOffset = offset + PAGE_SIZE; + setOffset(nextOffset); + fetchPages({ websiteId, offset: nextOffset, limit: PAGE_SIZE }); + }, [offset, websiteId, fetchPages]); return (
- {isLoadingFirst && ( + {isFirstLoad && isPending && ( <> @@ -125,35 +100,54 @@ export function WebsitePagesDialog({ )} - {!isLoadingFirst && results.length === 0 && ( + {!isFirstLoad && pages.length === 0 && ( )} - {results.map((page) => ( - + {pages.map((page) => ( + {page.title || page.url} - {page.title && {page.url}} - {page.content ? ( - - ) : ( - - {t('pagesDialog.noContent')} + {page.title && ( + + + {page.url} + )} +
+ + {t('pagesDialog.wordCount', { count: page.word_count })} + + + {t('pagesDialog.chunks', { count: page.chunks_count })} + + {page.last_crawled_at && ( + + {t('pagesDialog.lastCrawled', { + date: formatDate(page.last_crawled_at), + })} + + )} +
))} - {!isDone && !isLoadingFirst && ( + {hasMore && (
)} diff --git a/services/platform/app/features/websites/components/websites-table.tsx b/services/platform/app/features/websites/components/websites-table.tsx index e9ba2734b5..3c0fcb315b 100644 --- a/services/platform/app/features/websites/components/websites-table.tsx +++ b/services/platform/app/features/websites/components/websites-table.tsx @@ -2,7 +2,7 @@ import { useNavigate } from '@tanstack/react-router'; import { Globe } from 'lucide-react'; -import { useCallback, useMemo } from 'react'; +import { useCallback, useEffect, useMemo } from 'react'; import type { Doc } from '@/convex/_generated/dataModel'; @@ -10,6 +10,7 @@ import { DataTable } from '@/app/components/ui/data-table/data-table'; import { useListPage } from '@/app/hooks/use-list-page'; import { useT } from '@/lib/i18n/client'; +import { useSyncWebsiteStatuses } from '../hooks/mutations'; import { useApproxWebsiteCount, useListWebsitesPaginated, @@ -31,6 +32,16 @@ export function WebsitesTable({ organizationId, status }: WebsitesTableProps) { const { t: tWebsites } = useT('websites'); const { data: count } = useApproxWebsiteCount(organizationId); + const { mutate: syncStatuses } = useSyncWebsiteStatuses(); + + useEffect(() => { + const key = `websites-sync-${organizationId}`; + const lastSync = sessionStorage.getItem(key); + const fiveMinutes = 5 * 60 * 1000; + if (lastSync && Date.now() - Number(lastSync) < fiveMinutes) return; + sessionStorage.setItem(key, String(Date.now())); + syncStatuses({ organizationId }); + }, [organizationId, syncStatuses]); const { columns, searchPlaceholder, stickyLayout, pageSize } = useWebsitesTableConfig(); const paginatedResult = useListWebsitesPaginated({ @@ -67,8 +78,9 @@ export function WebsitesTable({ organizationId, status }: WebsitesTableProps) { key: 'status', title: tTables('headers.status'), options: [ - { value: 'active', label: tWebsites('filter.status.active') }, + { value: 'idle', label: tWebsites('filter.status.idle') }, { value: 'scanning', label: 
tWebsites('filter.status.scanning') }, + { value: 'active', label: tWebsites('filter.status.active') }, { value: 'error', label: tWebsites('filter.status.error') }, ], selectedValues: status ? [status] : [], diff --git a/services/platform/app/features/websites/hooks/mutations.ts b/services/platform/app/features/websites/hooks/mutations.ts index 1834b7a003..4119250b87 100644 --- a/services/platform/app/features/websites/hooks/mutations.ts +++ b/services/platform/app/features/websites/hooks/mutations.ts @@ -1,18 +1,23 @@ +import { useConvexAction } from '@/app/hooks/use-convex-action'; import { useConvexMutation } from '@/app/hooks/use-convex-mutation'; import { api } from '@/convex/_generated/api'; -export function useRescanWebsite() { - return useConvexMutation(api.websites.mutations.rescanWebsite); -} - export function useCreateWebsite() { - return useConvexMutation(api.websites.mutations.createWebsite); + return useConvexAction(api.websites.actions.createWebsite); } export function useDeleteWebsite() { - return useConvexMutation(api.websites.mutations.deleteWebsite); + return useConvexAction(api.websites.actions.deleteWebsite); +} + +export function useRescanWebsite() { + return useConvexAction(api.websites.actions.rescanWebsite); } export function useUpdateWebsite() { return useConvexMutation(api.websites.mutations.updateWebsite); } + +export function useSyncWebsiteStatuses() { + return useConvexAction(api.websites.actions.syncStatuses); +} diff --git a/services/platform/app/features/websites/hooks/queries.ts b/services/platform/app/features/websites/hooks/queries.ts index 6a94aacae6..e1dbb5dc35 100644 --- a/services/platform/app/features/websites/hooks/queries.ts +++ b/services/platform/app/features/websites/hooks/queries.ts @@ -1,4 +1,3 @@ -import type { Id } from '@/convex/_generated/dataModel'; import type { ConvexItemOf } from '@/lib/types/convex-helpers'; import { useCachedPaginatedQuery } from '@/app/hooks/use-cached-paginated-query'; @@ -39,19 +38,3 @@ 
export function useListWebsitesPaginated(args: ListWebsitesPaginatedArgs) { { initialNumItems }, ); } - -interface ListWebsitePagesPaginatedArgs { - websiteId: Id<'websites'>; - initialNumItems: number; -} - -export function useListWebsitePagesPaginated( - args: ListWebsitePagesPaginatedArgs, -) { - const { initialNumItems, ...queryArgs } = args; - return useCachedPaginatedQuery( - api.websites.queries.listWebsitePagesPaginated, - queryArgs, - { initialNumItems }, - ); -} diff --git a/services/platform/convex/_generated/api.d.ts b/services/platform/convex/_generated/api.d.ts index 5a863547bc..9b5a261da7 100644 --- a/services/platform/convex/_generated/api.d.ts +++ b/services/platform/convex/_generated/api.d.ts @@ -306,7 +306,6 @@ import type * as lib_crypto_get_secret_key from "../lib/crypto/get_secret_key.js import type * as lib_crypto_hex_to_bytes from "../lib/crypto/hex_to_bytes.js"; import type * as lib_crypto_internal_actions from "../lib/crypto/internal_actions.js"; import type * as lib_debug_log from "../lib/debug_log.js"; -import type * as lib_embedding_config from "../lib/embedding_config.js"; import type * as lib_error_classification from "../lib/error_classification.js"; import type * as lib_get_or_throw from "../lib/get_or_throw.js"; import type * as lib_get_user_teams from "../lib/get_user_teams.js"; @@ -450,7 +449,6 @@ import type * as predefined_workflows_product_recommendation_email from "../pred import type * as predefined_workflows_product_relationship_analysis from "../predefined_workflows/product_relationship_analysis.js"; import type * as predefined_workflows_shopify_sync_customers from "../predefined_workflows/shopify_sync_customers.js"; import type * as predefined_workflows_shopify_sync_products from "../predefined_workflows/shopify_sync_products.js"; -import type * as predefined_workflows_website_scan from "../predefined_workflows/website_scan.js"; import type * as predefined_workflows_workflow_rag_sync from 
"../predefined_workflows/workflow_rag_sync.js"; import type * as products_create_product from "../products/create_product.js"; import type * as products_create_product_with_translations from "../products/create_product_with_translations.js"; @@ -547,20 +545,10 @@ import type * as vendors_list_vendors_paginated from "../vendors/list_vendors_pa import type * as vendors_mutations from "../vendors/mutations.js"; import type * as vendors_queries from "../vendors/queries.js"; import type * as vendors_validators from "../vendors/validators.js"; -import type * as website_page_embeddings_chunk_content from "../website_page_embeddings/chunk_content.js"; -import type * as website_page_embeddings_content_hash from "../website_page_embeddings/content_hash.js"; -import type * as website_page_embeddings_embedding_pool from "../website_page_embeddings/embedding_pool.js"; -import type * as website_page_embeddings_internal_actions from "../website_page_embeddings/internal_actions.js"; -import type * as website_page_embeddings_internal_mutations from "../website_page_embeddings/internal_mutations.js"; -import type * as website_page_embeddings_internal_queries from "../website_page_embeddings/internal_queries.js"; -import type * as website_page_embeddings_rrf from "../website_page_embeddings/rrf.js"; +import type * as websites_actions from "../websites/actions.js"; import type * as websites_bulk_create_websites from "../websites/bulk_create_websites.js"; -import type * as websites_bulk_upsert_pages from "../websites/bulk_upsert_pages.js"; -import type * as websites_cleanup_website from "../websites/cleanup_website.js"; import type * as websites_create_website from "../websites/create_website.js"; import type * as websites_delete_website from "../websites/delete_website.js"; -import type * as websites_get_page_by_url from "../websites/get_page_by_url.js"; -import type * as websites_get_pages_by_website from "../websites/get_pages_by_website.js"; import type * as websites_get_website 
from "../websites/get_website.js"; import type * as websites_get_website_by_domain from "../websites/get_website_by_domain.js"; import type * as websites_get_websites from "../websites/get_websites.js"; @@ -568,12 +556,9 @@ import type * as websites_helpers from "../websites/helpers.js"; import type * as websites_internal_actions from "../websites/internal_actions.js"; import type * as websites_internal_mutations from "../websites/internal_mutations.js"; import type * as websites_internal_queries from "../websites/internal_queries.js"; -import type * as websites_list_website_pages_paginated from "../websites/list_website_pages_paginated.js"; import type * as websites_list_websites_paginated from "../websites/list_websites_paginated.js"; import type * as websites_mutations from "../websites/mutations.js"; -import type * as websites_provision_website_scan_workflow from "../websites/provision_website_scan_workflow.js"; import type * as websites_queries from "../websites/queries.js"; -import type * as websites_register_urls from "../websites/register_urls.js"; import type * as websites_rescan_website from "../websites/rescan_website.js"; import type * as websites_search_websites from "../websites/search_websites.js"; import type * as websites_types from "../websites/types.js"; @@ -640,8 +625,6 @@ import type * as workflow_engine_action_defs_rag_rag_action from "../workflow_en import type * as workflow_engine_action_defs_set_variables_action from "../workflow_engine/action_defs/set_variables_action.js"; import type * as workflow_engine_action_defs_website_helpers_types from "../workflow_engine/action_defs/website/helpers/types.js"; import type * as workflow_engine_action_defs_website_website_action from "../workflow_engine/action_defs/website/website_action.js"; -import type * as workflow_engine_action_defs_website_pages_helpers_types from "../workflow_engine/action_defs/website_pages/helpers/types.js"; -import type * as 
workflow_engine_action_defs_website_pages_website_pages_action from "../workflow_engine/action_defs/website_pages/website_pages_action.js"; import type * as workflow_engine_action_defs_workflow_helpers_types from "../workflow_engine/action_defs/workflow/helpers/types.js"; import type * as workflow_engine_action_defs_workflow_helpers_upload_workflows from "../workflow_engine/action_defs/workflow/helpers/upload_workflows.js"; import type * as workflow_engine_action_defs_workflow_workflow_action from "../workflow_engine/action_defs/workflow/workflow_action.js"; @@ -1158,7 +1141,6 @@ declare const fullApi: ApiFromModules<{ "lib/crypto/hex_to_bytes": typeof lib_crypto_hex_to_bytes; "lib/crypto/internal_actions": typeof lib_crypto_internal_actions; "lib/debug_log": typeof lib_debug_log; - "lib/embedding_config": typeof lib_embedding_config; "lib/error_classification": typeof lib_error_classification; "lib/get_or_throw": typeof lib_get_or_throw; "lib/get_user_teams": typeof lib_get_user_teams; @@ -1302,7 +1284,6 @@ declare const fullApi: ApiFromModules<{ "predefined_workflows/product_relationship_analysis": typeof predefined_workflows_product_relationship_analysis; "predefined_workflows/shopify_sync_customers": typeof predefined_workflows_shopify_sync_customers; "predefined_workflows/shopify_sync_products": typeof predefined_workflows_shopify_sync_products; - "predefined_workflows/website_scan": typeof predefined_workflows_website_scan; "predefined_workflows/workflow_rag_sync": typeof predefined_workflows_workflow_rag_sync; "products/create_product": typeof products_create_product; "products/create_product_with_translations": typeof products_create_product_with_translations; @@ -1399,20 +1380,10 @@ declare const fullApi: ApiFromModules<{ "vendors/mutations": typeof vendors_mutations; "vendors/queries": typeof vendors_queries; "vendors/validators": typeof vendors_validators; - "website_page_embeddings/chunk_content": typeof website_page_embeddings_chunk_content; - 
"website_page_embeddings/content_hash": typeof website_page_embeddings_content_hash; - "website_page_embeddings/embedding_pool": typeof website_page_embeddings_embedding_pool; - "website_page_embeddings/internal_actions": typeof website_page_embeddings_internal_actions; - "website_page_embeddings/internal_mutations": typeof website_page_embeddings_internal_mutations; - "website_page_embeddings/internal_queries": typeof website_page_embeddings_internal_queries; - "website_page_embeddings/rrf": typeof website_page_embeddings_rrf; + "websites/actions": typeof websites_actions; "websites/bulk_create_websites": typeof websites_bulk_create_websites; - "websites/bulk_upsert_pages": typeof websites_bulk_upsert_pages; - "websites/cleanup_website": typeof websites_cleanup_website; "websites/create_website": typeof websites_create_website; "websites/delete_website": typeof websites_delete_website; - "websites/get_page_by_url": typeof websites_get_page_by_url; - "websites/get_pages_by_website": typeof websites_get_pages_by_website; "websites/get_website": typeof websites_get_website; "websites/get_website_by_domain": typeof websites_get_website_by_domain; "websites/get_websites": typeof websites_get_websites; @@ -1420,12 +1391,9 @@ declare const fullApi: ApiFromModules<{ "websites/internal_actions": typeof websites_internal_actions; "websites/internal_mutations": typeof websites_internal_mutations; "websites/internal_queries": typeof websites_internal_queries; - "websites/list_website_pages_paginated": typeof websites_list_website_pages_paginated; "websites/list_websites_paginated": typeof websites_list_websites_paginated; "websites/mutations": typeof websites_mutations; - "websites/provision_website_scan_workflow": typeof websites_provision_website_scan_workflow; "websites/queries": typeof websites_queries; - "websites/register_urls": typeof websites_register_urls; "websites/rescan_website": typeof websites_rescan_website; "websites/search_websites": typeof 
websites_search_websites; "websites/types": typeof websites_types; @@ -1492,8 +1460,6 @@ declare const fullApi: ApiFromModules<{ "workflow_engine/action_defs/set_variables_action": typeof workflow_engine_action_defs_set_variables_action; "workflow_engine/action_defs/website/helpers/types": typeof workflow_engine_action_defs_website_helpers_types; "workflow_engine/action_defs/website/website_action": typeof workflow_engine_action_defs_website_website_action; - "workflow_engine/action_defs/website_pages/helpers/types": typeof workflow_engine_action_defs_website_pages_helpers_types; - "workflow_engine/action_defs/website_pages/website_pages_action": typeof workflow_engine_action_defs_website_pages_website_pages_action; "workflow_engine/action_defs/workflow/helpers/types": typeof workflow_engine_action_defs_workflow_helpers_types; "workflow_engine/action_defs/workflow/helpers/upload_workflows": typeof workflow_engine_action_defs_workflow_helpers_upload_workflows; "workflow_engine/action_defs/workflow/workflow_action": typeof workflow_engine_action_defs_workflow_workflow_action; @@ -8375,91 +8341,4 @@ export declare const components: { >; }; }; - embeddingPool: { - lib: { - cancel: FunctionReference< - "mutation", - "internal", - { - id: string; - logLevel: "DEBUG" | "TRACE" | "INFO" | "REPORT" | "WARN" | "ERROR"; - }, - any - >; - cancelAll: FunctionReference< - "mutation", - "internal", - { - before?: number; - limit?: number; - logLevel: "DEBUG" | "TRACE" | "INFO" | "REPORT" | "WARN" | "ERROR"; - }, - any - >; - enqueue: FunctionReference< - "mutation", - "internal", - { - config: { - logLevel: "DEBUG" | "TRACE" | "INFO" | "REPORT" | "WARN" | "ERROR"; - maxParallelism: number; - }; - fnArgs: any; - fnHandle: string; - fnName: string; - fnType: "action" | "mutation" | "query"; - onComplete?: { context?: any; fnHandle: string }; - retryBehavior?: { - base: number; - initialBackoffMs: number; - maxAttempts: number; - }; - runAt: number; - }, - string - >; - 
enqueueBatch: FunctionReference< - "mutation", - "internal", - { - config: { - logLevel: "DEBUG" | "TRACE" | "INFO" | "REPORT" | "WARN" | "ERROR"; - maxParallelism: number; - }; - items: Array<{ - fnArgs: any; - fnHandle: string; - fnName: string; - fnType: "action" | "mutation" | "query"; - onComplete?: { context?: any; fnHandle: string }; - retryBehavior?: { - base: number; - initialBackoffMs: number; - maxAttempts: number; - }; - runAt: number; - }>; - }, - Array - >; - status: FunctionReference< - "query", - "internal", - { id: string }, - | { previousAttempts: number; state: "pending" } - | { previousAttempts: number; state: "running" } - | { state: "finished" } - >; - statusBatch: FunctionReference< - "query", - "internal", - { ids: Array }, - Array< - | { previousAttempts: number; state: "pending" } - | { previousAttempts: number; state: "running" } - | { state: "finished" } - > - >; - }; - }; }; diff --git a/services/platform/convex/agent_tools/database/helpers/schema_definitions.ts b/services/platform/convex/agent_tools/database/helpers/schema_definitions.ts index ed8bc394dd..2d83dd6a69 100644 --- a/services/platform/convex/agent_tools/database/helpers/schema_definitions.ts +++ b/services/platform/convex/agent_tools/database/helpers/schema_definitions.ts @@ -176,19 +176,6 @@ export const TABLE_SCHEMAS: Record = { examples: ['sourceProvider == "onedrive"', 'daysAgo(_creationTime) < 7'], }, - websitePages: { - tableName: 'websitePages', - description: 'Crawled website pages for RAG indexing', - filterableFields: [ - { - field: '_creationTime', - type: 'datetime', - note: 'When the page was crawled. 
Use with daysAgo() transform.', - }, - ], - examples: ['daysAgo(_creationTime) > 30'], - }, - onedriveSyncConfigs: { tableName: 'onedriveSyncConfigs', description: 'OneDrive sync configuration records', diff --git a/services/platform/convex/agent_tools/web/helpers/query_web_context.ts b/services/platform/convex/agent_tools/web/helpers/query_web_context.ts index cf59166f27..f58236a983 100644 --- a/services/platform/convex/agent_tools/web/helpers/query_web_context.ts +++ b/services/platform/convex/agent_tools/web/helpers/query_web_context.ts @@ -10,8 +10,8 @@ import type { ActionCtx } from '../../../_generated/server'; -import { internal } from '../../../_generated/api'; import { createDebugLog } from '../../../lib/debug_log'; +import { getCrawlerServiceUrl } from './get_crawler_service_url'; const debugLog = createDebugLog('DEBUG_WEB_CONTEXT', '[WebContext]'); @@ -21,26 +21,31 @@ const WEB_CONTEXT_TIMEOUT_MS = 10_000; interface SearchResult { url: string; title?: string; - chunkContent: string; - chunkIndex: number; + chunk_content: string; + chunk_index: number; score: number; } +interface SearchApiResponse { + query: string; + results: SearchResult[]; + total: number; +} + /** * Query crawled website pages and return formatted context string. 
* * @returns Formatted context string or undefined if no results / on failure */ export async function queryWebContext( - ctx: ActionCtx, - organizationId: string, + _ctx: ActionCtx, + _organizationId: string, query: string, limit = DEFAULT_LIMIT, ): Promise { try { debugLog('Querying web context', { query: query.slice(0, 100), - organizationId, limit, }); @@ -51,13 +56,26 @@ export async function queryWebContext( ); try { - const results: SearchResult[] = await ctx.runAction( - internal.website_page_embeddings.internal_actions.search, - { organizationId, query, limit }, - ); + const crawlerUrl = getCrawlerServiceUrl(); + const response = await fetch(`${crawlerUrl}/api/v1/search`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ query, limit }), + signal: controller.signal, + }); clearTimeout(timeoutId); + if (!response.ok) { + console.error('[web_context] Search API error', { + status: response.status, + }); + return undefined; + } + + const data: SearchApiResponse = await response.json(); + const results = data.results; + if (!results || results.length === 0) { debugLog('No web context results', { query: query.slice(0, 100) }); return undefined; @@ -76,8 +94,8 @@ export async function queryWebContext( const bestScore = Math.max(...chunks.map((c) => c.score)); const title = chunks[0].title ?? url; const contentParts = chunks - .sort((a, b) => a.chunkIndex - b.chunkIndex) - .map((c) => c.chunkContent) + .sort((a, b) => a.chunk_index - b.chunk_index) + .map((c) => c.chunk_content) .join('\n\n'); return { diff --git a/services/platform/convex/agent_tools/web/helpers/search_pages.ts b/services/platform/convex/agent_tools/web/helpers/search_pages.ts index e6aee8a428..65bccb17f2 100644 --- a/services/platform/convex/agent_tools/web/helpers/search_pages.ts +++ b/services/platform/convex/agent_tools/web/helpers/search_pages.ts @@ -1,14 +1,14 @@ /** - * Search crawled website pages using semantic similarity. 
+ * Search crawled website pages using hybrid search (full-text + vector). * - * Calls the internal embedding search action and formats results + * Calls the crawler service search API and formats results * for the LLM — deduplicating by URL and ordering by relevance. */ import type { ToolCtx } from '@convex-dev/agent'; -import { internal } from '../../../_generated/api'; import { createDebugLog } from '../../../lib/debug_log'; +import { getCrawlerServiceUrl } from './get_crawler_service_url'; const debugLog = createDebugLog('DEBUG_AGENT_TOOLS', '[AgentTools]'); @@ -17,30 +17,36 @@ const DEFAULT_LIMIT = 10; interface SearchResult { url: string; title?: string; - chunkContent: string; - chunkIndex: number; + chunk_content: string; + chunk_index: number; score: number; } +interface SearchApiResponse { + query: string; + results: SearchResult[]; + total: number; +} + export async function searchPages( ctx: ToolCtx, args: { query: string }, ): Promise { - const organizationId = ctx.organizationId; - if (!organizationId) { - throw new Error('search_pages requires organizationId in ToolCtx.'); - } - debugLog('web:search_pages start', { query: args.query }); - const results: SearchResult[] = await ctx.runAction( - internal.website_page_embeddings.internal_actions.search, - { - organizationId, - query: args.query, - limit: DEFAULT_LIMIT, - }, - ); + const crawlerUrl = getCrawlerServiceUrl(); + const response = await fetch(`${crawlerUrl}/api/v1/search`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ query: args.query, limit: DEFAULT_LIMIT }), + }); + + if (!response.ok) { + throw new Error(`Search API returned ${response.status}`); + } + + const data: SearchApiResponse = await response.json(); + const results = data.results; if (!results || results.length === 0) { debugLog('web:search_pages no results', { query: args.query }); @@ -65,8 +71,8 @@ export async function searchPages( const bestScore = Math.max(...chunks.map((c) => 
c.score)); const title = chunks[0].title ?? url; const contentParts = chunks - .sort((a, b) => a.chunkIndex - b.chunkIndex) - .map((c) => c.chunkContent) + .sort((a, b) => a.chunk_index - b.chunk_index) + .map((c) => c.chunk_content) .join('\n\n'); return { diff --git a/services/platform/convex/convex.config.ts b/services/platform/convex/convex.config.ts index 604aee9243..09e0f23df4 100644 --- a/services/platform/convex/convex.config.ts +++ b/services/platform/convex/convex.config.ts @@ -3,7 +3,6 @@ import agent from '@convex-dev/agent/convex.config'; import persistentTextStreaming from '@convex-dev/persistent-text-streaming/convex.config'; import rateLimiter from '@convex-dev/rate-limiter/convex.config'; import workflow from '@convex-dev/workflow/convex.config'; -import workpool from '@convex-dev/workpool/convex.config'; import { defineApp } from 'convex/server'; import betterAuth from './betterAuth/convex.config'; @@ -18,6 +17,5 @@ app.use(agent); app.use(rateLimiter); app.use(actionCache); app.use(persistentTextStreaming); -app.use(workpool, { name: 'embeddingPool' }); export default app; diff --git a/services/platform/convex/lib/embedding_config.test.ts b/services/platform/convex/lib/embedding_config.test.ts deleted file mode 100644 index 612b5c18d9..0000000000 --- a/services/platform/convex/lib/embedding_config.test.ts +++ /dev/null @@ -1,162 +0,0 @@ -import type { EmbeddingModelV2 } from '@ai-sdk/provider'; - -import { afterEach, describe, expect, it, vi } from 'vitest'; - -import { - getEmbeddingDimension, - getEmbeddingTableName, - getRecommendedEmbeddingModel, - SUPPORTED_DIMENSIONS, - withDimensions, -} from './embedding_config'; - -describe('embedding_config', () => { - afterEach(() => { - vi.unstubAllEnvs(); - }); - - describe('SUPPORTED_DIMENSIONS', () => { - it('contains expected dimension values', () => { - expect(SUPPORTED_DIMENSIONS).toEqual([ - 256, 512, 1024, 1536, 2048, 2560, 4096, - ]); - }); - }); - - describe('getEmbeddingDimension', () => { 
- it('defaults to 1536 when env var not set', () => { - vi.stubEnv('EMBEDDING_DIMENSIONS', ''); - expect(getEmbeddingDimension()).toBe(1536); - }); - - it('returns configured dimension', () => { - vi.stubEnv('EMBEDDING_DIMENSIONS', '256'); - expect(getEmbeddingDimension()).toBe(256); - }); - - it('throws on unsupported dimension', () => { - vi.stubEnv('EMBEDDING_DIMENSIONS', '999'); - expect(() => getEmbeddingDimension()).toThrow( - 'Invalid EMBEDDING_DIMENSIONS', - ); - }); - - it('throws on non-numeric value', () => { - vi.stubEnv('EMBEDDING_DIMENSIONS', 'abc'); - expect(() => getEmbeddingDimension()).toThrow( - 'Invalid EMBEDDING_DIMENSIONS', - ); - }); - }); - - describe('getEmbeddingTableName', () => { - it('returns correct table for each dimension', () => { - const cases: Array<[string, string]> = [ - ['256', 'websitePageEmbeddings256'], - ['512', 'websitePageEmbeddings512'], - ['1024', 'websitePageEmbeddings1024'], - ['1536', 'websitePageEmbeddings1536'], - ['2048', 'websitePageEmbeddings2048'], - ['2560', 'websitePageEmbeddings2560'], - ['4096', 'websitePageEmbeddings4096'], - ]; - - for (const [dim, table] of cases) { - vi.stubEnv('EMBEDDING_DIMENSIONS', dim); - expect(getEmbeddingTableName()).toBe(table); - } - }); - }); - - describe('getRecommendedEmbeddingModel', () => { - it('returns env model when set', () => { - vi.stubEnv('OPENAI_EMBEDDING_MODEL', 'custom-model'); - expect(getRecommendedEmbeddingModel()).toBe('custom-model'); - }); - - it('throws when env not set', () => { - vi.stubEnv('OPENAI_EMBEDDING_MODEL', ''); - expect(() => getRecommendedEmbeddingModel()).toThrow( - 'OPENAI_EMBEDDING_MODEL', - ); - }); - }); - - describe('withDimensions', () => { - function createMockModel(): EmbeddingModelV2 & { - doEmbedCalls: Parameters['doEmbed']>[0][]; - } { - const doEmbedCalls: Parameters['doEmbed']>[0][] = - []; - return { - specificationVersion: 'v2', - modelId: 'test-model', - provider: 'test-provider', - maxEmbeddingsPerCall: 100, - 
supportsParallelCalls: true, - doEmbedCalls, - async doEmbed( - options: Parameters['doEmbed']>[0], - ) { - doEmbedCalls.push(options); - return { - embeddings: options.values.map(() => [0.1, 0.2, 0.3]), - }; - }, - }; - } - - it('injects dimensions into providerOptions.openai', async () => { - const mock = createMockModel(); - const wrapped = withDimensions(mock, 1024); - - await wrapped.doEmbed({ values: ['hello'] }); - - expect(mock.doEmbedCalls).toHaveLength(1); - expect(mock.doEmbedCalls[0].providerOptions).toEqual({ - openai: { dimensions: 1024 }, - }); - }); - - it('preserves existing providerOptions from caller', async () => { - const mock = createMockModel(); - const wrapped = withDimensions(mock, 512); - - await wrapped.doEmbed({ - values: ['hello'], - providerOptions: { - openai: { user: 'test-user' }, - other: { key: 'value' }, - }, - }); - - expect(mock.doEmbedCalls[0].providerOptions).toEqual({ - openai: { user: 'test-user', dimensions: 512 }, - other: { key: 'value' }, - }); - }); - - it('preserves model properties', () => { - const mock = createMockModel(); - const wrapped = withDimensions(mock, 256); - - expect(wrapped.specificationVersion).toBe('v2'); - expect(wrapped.modelId).toBe('test-model'); - expect(wrapped.provider).toBe('test-provider'); - expect(wrapped.maxEmbeddingsPerCall).toBe(100); - expect(wrapped.supportsParallelCalls).toBe(true); - }); - - it('returns embeddings from the underlying model', async () => { - const mock = createMockModel(); - const wrapped = withDimensions(mock, 1024); - - const result = await wrapped.doEmbed({ values: ['a', 'b'] }); - - expect(result.embeddings).toEqual([ - [0.1, 0.2, 0.3], - [0.1, 0.2, 0.3], - ]); - }); - }); -}); diff --git a/services/platform/convex/lib/embedding_config.ts b/services/platform/convex/lib/embedding_config.ts deleted file mode 100644 index d9f7b1dd6d..0000000000 --- a/services/platform/convex/lib/embedding_config.ts +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Embedding Dimension 
Configuration - * - * Reads EMBEDDING_DIMENSIONS env var to determine which vector table to use - * and injects the dimensions parameter into embedding API calls. - * Supports: 256, 512, 1024, 1536 (default), 2048, 2560, 4096. - */ - -import type { EmbeddingModelV2 } from '@ai-sdk/provider'; -import type { TableNamesInDataModel } from 'convex/server'; - -import type { DataModel } from '../_generated/dataModel'; - -import { getEnvOrThrow, getEnvWithDefault } from './get_or_throw'; -import { openai } from './openai_provider'; - -export const SUPPORTED_DIMENSIONS = [ - 256, 512, 1024, 1536, 2048, 2560, 4096, -] as const; - -export type SupportedDimension = (typeof SUPPORTED_DIMENSIONS)[number]; - -const DIMENSION_TO_TABLE: Record< - SupportedDimension, - TableNamesInDataModel -> = { - 256: 'websitePageEmbeddings256', - 512: 'websitePageEmbeddings512', - 1024: 'websitePageEmbeddings1024', - 1536: 'websitePageEmbeddings1536', - 2048: 'websitePageEmbeddings2048', - 2560: 'websitePageEmbeddings2560', - 4096: 'websitePageEmbeddings4096', -}; - -function isSupportedDimension(value: number): value is SupportedDimension { - return (SUPPORTED_DIMENSIONS as readonly number[]).includes(value); -} - -export function getEmbeddingDimension(): SupportedDimension { - const raw = getEnvWithDefault('EMBEDDING_DIMENSIONS', '1536'); - const parsed = parseInt(raw, 10); - if (!isSupportedDimension(parsed)) { - throw new Error( - `[Embedding] Invalid EMBEDDING_DIMENSIONS="${raw}". ` + - `Supported: ${SUPPORTED_DIMENSIONS.join(', ')}`, - ); - } - return parsed; -} - -export function getEmbeddingTableName(): TableNamesInDataModel { - return DIMENSION_TO_TABLE[getEmbeddingDimension()]; -} - -export function getRecommendedEmbeddingModel(): string { - return getEnvOrThrow( - 'OPENAI_EMBEDDING_MODEL', - 'Embedding model name (e.g. 
text-embedding-3-small)', - ); -} - -export function getTextEmbeddingModel(): EmbeddingModelV2 { - const model = openai.embedding(getRecommendedEmbeddingModel()); - return withDimensions(model, getEmbeddingDimension()); -} - -/** - * Wraps an embedding model to inject `dimensions` into every API call - * via `providerOptions.openai.dimensions`. - * - * This is necessary because the `@ai-sdk/openai` `embedding()` factory - * does not accept a dimensions setting, and `@convex-dev/agent`'s - * `embedMany` only forwards `callSettings` (which excludes providerOptions). - * Wrapping at the model level ensures every `doEmbed` call carries the - * configured dimension regardless of the calling code. - */ -export function withDimensions( - model: EmbeddingModelV2, - dimensions: number, -): EmbeddingModelV2 { - const originalDoEmbed = model.doEmbed.bind(model); - return { - specificationVersion: model.specificationVersion, - modelId: model.modelId, - get provider() { - return model.provider; - }, - maxEmbeddingsPerCall: model.maxEmbeddingsPerCall, - supportsParallelCalls: model.supportsParallelCalls, - doEmbed(options: Parameters['doEmbed']>[0]) { - return originalDoEmbed({ - ...options, - providerOptions: { - ...options.providerOptions, - openai: { - ...options.providerOptions?.openai, - dimensions, - }, - }, - }); - }, - }; -} diff --git a/services/platform/convex/lib/rls/helpers/rls_rules.ts b/services/platform/convex/lib/rls/helpers/rls_rules.ts index 797bddb313..6432953682 100644 --- a/services/platform/convex/lib/rls/helpers/rls_rules.ts +++ b/services/platform/convex/lib/rls/helpers/rls_rules.ts @@ -465,18 +465,6 @@ export async function rlsRules( }, }, - // Website Pages - organization-scoped, same access as websites - websitePages: { - read: async (_, page) => { - if (!user) return false; - if (!userOrgIds.has(page.organizationId)) return false; - const membership = userOrganizations.find( - (m) => m.organizationId === page.organizationId, - ); - return 
authorizeRls(membership?.role, 'websites', 'read'); - }, - }, - // Audit Logs - organization-scoped, allow inserts for org members auditLogs: { read: async (_, log) => { diff --git a/services/platform/convex/predefined_workflows/index.ts b/services/platform/convex/predefined_workflows/index.ts index 09aed95f3a..ded3b49989 100644 --- a/services/platform/convex/predefined_workflows/index.ts +++ b/services/platform/convex/predefined_workflows/index.ts @@ -17,7 +17,6 @@ import productRecommendationEmail from './product_recommendation_email'; import productRelationshipAnalysis from './product_relationship_analysis'; import shopifySyncCustomers from './shopify_sync_customers'; import shopifySyncProducts from './shopify_sync_products'; -import websiteScan from './website_scan'; import workflowRagSync from './workflow_rag_sync'; // Dynamic Orchestration Examples @@ -39,7 +38,6 @@ export const workflows = { productRagSync, customerRagSync, onedriveSync, - websiteScan, workflowRagSync, conversationAutoArchive, conversationSync, diff --git a/services/platform/convex/predefined_workflows/website_scan.ts b/services/platform/convex/predefined_workflows/website_scan.ts deleted file mode 100644 index 24d82abec1..0000000000 --- a/services/platform/convex/predefined_workflows/website_scan.ts +++ /dev/null @@ -1,190 +0,0 @@ -/** - * Website Scan Workflow — Discover URLs, register changes, and sync content - * - * This workflow reads URLs (with content hashes) from the crawler service's - * persistent registry, registers new/changed/deleted pages in Convex, - * and fetches content for new/updated pages. 
- * - * Flow: - * start → fetch_main_page → update_metadata → query_urls → register_urls - * → crawl_and_upsert_pages → check_has_more → (loop or update_status) - */ - -const websiteScanWorkflow = { - workflowConfig: { - name: 'Website Scan', - description: 'Discover URLs, register changes, and sync page content.', - version: '3.0.0', - workflowType: 'predefined', - config: { - timeout: 1800000, - retryPolicy: { maxRetries: 2, backoffMs: 5000 }, - variables: { - organizationId: 'org_demo', - websiteUrl: 'https://burgenstockresort.com/', - websiteDomain: 'burgenstockresort.com', - scanInterval: '6h', - maxPages: 100, - wordCountThreshold: 100, - crawlerTimeoutMs: 1800000, - offset: 0, - }, - }, - }, - - stepsConfig: [ - { - stepSlug: 'start', - name: 'start', - stepType: 'start', - order: 1, - config: {}, - nextSteps: { success: 'fetch_main_page' }, - }, - - { - stepSlug: 'fetch_main_page', - name: 'Fetch Main Page', - stepType: 'action', - order: 2, - config: { - type: 'crawler', - parameters: { - operation: 'fetch_urls', - urls: ['{{websiteUrl}}'], - wordCountThreshold: '{{wordCountThreshold}}', - timeout: '{{crawlerTimeoutMs}}', - }, - }, - nextSteps: { success: 'update_metadata' }, - }, - - { - stepSlug: 'update_metadata', - name: 'Update Website Metadata', - stepType: 'action', - order: 3, - config: { - type: 'website', - parameters: { - operation: 'update', - websiteId: '{{websiteId}}', - title: - '{{steps.fetch_main_page.output.data.pages[0].metadata.title || steps.fetch_main_page.output.data.pages[0].title || websiteDomain}}', - description: - '{{steps.fetch_main_page.output.data.pages[0].metadata.description || ""}}', - lastScannedAt: '{{nowMs}}', - status: 'active', - metadata: { - scan_status: 'scanning', - last_crawl_timestamp: '{{nowMs}}', - }, - }, - }, - nextSteps: { success: 'query_urls' }, - }, - - { - stepSlug: 'query_urls', - name: 'Query URLs', - stepType: 'action', - order: 4, - config: { - type: 'crawler', - parameters: { - operation: 
'query_urls', - domain: '{{websiteDomain}}', - offset: '{{offset}}', - limit: '{{maxPages}}', - timeout: '{{crawlerTimeoutMs}}', - }, - }, - nextSteps: { success: 'register_urls' }, - }, - - { - stepSlug: 'register_urls', - name: 'Register URLs', - stepType: 'action', - order: 5, - config: { - type: 'websitePages', - parameters: { - operation: 'register_urls', - websiteId: '{{websiteId}}', - urls: '{{steps.query_urls.output.data.urls}}', - }, - }, - nextSteps: { success: 'sync_pages' }, - }, - - { - stepSlug: 'sync_pages', - name: 'Sync Pages', - stepType: 'action', - order: 6, - config: { - type: 'websitePages', - parameters: { - operation: 'crawl_and_upsert', - websiteId: '{{websiteId}}', - urls: '{{steps.register_urls.output.data.urlsToSync}}', - wordCountThreshold: '{{wordCountThreshold}}', - crawlerTimeoutMs: '{{crawlerTimeoutMs}}', - }, - }, - nextSteps: { success: 'check_has_more' }, - }, - - { - stepSlug: 'check_has_more', - name: 'Check Has More', - stepType: 'condition', - order: 7, - config: { - expression: 'steps.query_urls.output.data.has_more == true', - }, - nextSteps: { - true: 'update_offset', - false: 'update_status', - }, - }, - - { - stepSlug: 'update_offset', - name: 'Update Offset', - stepType: 'action', - order: 8, - config: { - type: 'set_variables', - parameters: { - variables: [{ name: 'offset', value: '{{offset + maxPages}}' }], - }, - }, - nextSteps: { success: 'query_urls' }, - }, - - { - stepSlug: 'update_status', - name: 'Update Website Status', - stepType: 'action', - order: 9, - config: { - type: 'website', - parameters: { - operation: 'update', - websiteId: '{{websiteId}}', - status: 'active', - lastScannedAt: '{{nowMs}}', - metadata: { - scan_status: 'complete', - last_crawl_timestamp: '{{nowMs}}', - }, - }, - }, - nextSteps: { success: 'noop' }, - }, - ], -}; - -export default websiteScanWorkflow; diff --git a/services/platform/convex/schema.ts b/services/platform/convex/schema.ts index c5a1b795fa..e167dd71f3 100644 --- 
a/services/platform/convex/schema.ts +++ b/services/platform/convex/schema.ts @@ -18,16 +18,7 @@ import { ssoProvidersTable } from './sso_providers/schema'; import { messageMetadataTable } from './streaming/schema'; import { threadMetadataTable } from './threads/schema'; import { vendorsTable } from './vendors/schema'; -import { - websitePageEmbeddings256Table, - websitePageEmbeddings512Table, - websitePageEmbeddings1024Table, - websitePageEmbeddings1536Table, - websitePageEmbeddings2048Table, - websitePageEmbeddings2560Table, - websitePageEmbeddings4096Table, -} from './website_page_embeddings/schema'; -import { websitesTable, websitePagesTable } from './websites/schema'; +import { websitesTable } from './websites/schema'; import { wfDefinitionsTable, wfExecutionsTable, @@ -60,14 +51,6 @@ export default defineSchema({ products: productsTable, ssoProviders: ssoProvidersTable, vendors: vendorsTable, - websitePageEmbeddings256: websitePageEmbeddings256Table, - websitePageEmbeddings512: websitePageEmbeddings512Table, - websitePageEmbeddings1024: websitePageEmbeddings1024Table, - websitePageEmbeddings1536: websitePageEmbeddings1536Table, - websitePageEmbeddings2048: websitePageEmbeddings2048Table, - websitePageEmbeddings2560: websitePageEmbeddings2560Table, - websitePageEmbeddings4096: websitePageEmbeddings4096Table, - websitePages: websitePagesTable, websites: websitesTable, wfApiKeys: wfApiKeysTable, wfDefinitions: wfDefinitionsTable, diff --git a/services/platform/convex/website_page_embeddings/chunk_content.test.ts b/services/platform/convex/website_page_embeddings/chunk_content.test.ts deleted file mode 100644 index 8faa99784c..0000000000 --- a/services/platform/convex/website_page_embeddings/chunk_content.test.ts +++ /dev/null @@ -1,92 +0,0 @@ -import { describe, expect, it } from 'vitest'; - -import { chunkContent } from './chunk_content'; - -describe('chunkContent', () => { - it('returns empty array for empty content', () => { - 
expect(chunkContent('')).toEqual([]); - expect(chunkContent(' ')).toEqual([]); - }); - - it('returns single chunk for short content', () => { - const result = chunkContent('Hello world'); - expect(result).toHaveLength(1); - expect(result[0].content).toBe('Hello world'); - expect(result[0].index).toBe(0); - }); - - it('prepends title to chunks', () => { - const result = chunkContent('Some content', 'My Page Title'); - expect(result).toHaveLength(1); - expect(result[0].content).toBe('My Page Title\n\nSome content'); - }); - - it('splits long content into multiple chunks', () => { - const paragraph = 'Lorem ipsum dolor sit amet. '.repeat(100); - const chunkSize = 200; - const chunkOverlap = 50; - const result = chunkContent(paragraph, undefined, chunkSize, chunkOverlap); - - expect(result.length).toBeGreaterThan(1); - // Chunks can exceed chunkSize by up to chunkOverlap because overlap - // text from the previous chunk is prepended to the next chunk. - for (const chunk of result) { - expect(chunk.content.length).toBeLessThanOrEqual( - chunkSize + chunkOverlap, - ); - } - }); - - it('assigns sequential chunk indices', () => { - const longContent = Array.from( - { length: 20 }, - (_, i) => `Paragraph ${i}. 
This is a test paragraph with enough content.`, - ).join('\n\n'); - - const result = chunkContent(longContent, undefined, 200, 50); - for (let i = 0; i < result.length; i++) { - expect(result[i].index).toBe(i); - } - }); - - it('splits by paragraphs when possible', () => { - // Each paragraph must exceed MIN_CHUNK_LENGTH (50) to not be filtered out - const para1 = - 'First paragraph with enough content to pass the minimum length filter easily.'; - const para2 = - 'Second paragraph also needs sufficient content to exceed the minimum threshold.'; - const content = `${para1}\n\n${para2}`; - const result = chunkContent(content, undefined, 100, 10); - - expect(result.length).toBeGreaterThanOrEqual(2); - }); - - it('handles content with only whitespace between paragraphs', () => { - const content = 'First.\n\n\n\nSecond.'; - const result = chunkContent(content); - expect(result.length).toBeGreaterThanOrEqual(1); - expect(result[0].content).toContain('First.'); - }); - - it('filters out chunks shorter than minimum length', () => { - const longParagraph = - 'This is a sufficiently long paragraph that has exactly enough words to be useful for testing.'; - const shortParagraph = 'OK.'; - const content = `${longParagraph}\n\n${shortParagraph}\n\n${longParagraph}`; - const result = chunkContent(content, undefined, 100, 1); - - expect(result.length).toBeGreaterThanOrEqual(1); - for (const chunk of result) { - expect(chunk.content.length).toBeGreaterThanOrEqual(50); - } - }); - - it('handles title taking most of chunk size', () => { - const longTitle = 'A'.repeat(1400); - const content = 'Short content here.'; - const result = chunkContent(content, longTitle); - expect(result).toHaveLength(1); - expect(result[0].content).toContain(longTitle); - expect(result[0].content).toContain(content); - }); -}); diff --git a/services/platform/convex/website_page_embeddings/chunk_content.ts b/services/platform/convex/website_page_embeddings/chunk_content.ts deleted file mode 100644 index 
a00f570bd3..0000000000 --- a/services/platform/convex/website_page_embeddings/chunk_content.ts +++ /dev/null @@ -1,128 +0,0 @@ -/** - * Content Chunking for Embeddings - * - * Splits page content into overlapping chunks suitable for embedding generation. - * Splits by paragraphs first, then sentences, with configurable size and overlap. - */ - -const DEFAULT_CHUNK_SIZE = 1500; -const DEFAULT_CHUNK_OVERLAP = 200; -const MIN_CHUNK_LENGTH = 50; - -export interface ContentChunk { - content: string; - index: number; -} - -export function chunkContent( - content: string, - title?: string, - chunkSize = DEFAULT_CHUNK_SIZE, - chunkOverlap = DEFAULT_CHUNK_OVERLAP, -): ContentChunk[] { - const trimmed = content.trim(); - if (!trimmed) return []; - - const prefix = title ? `${title}\n\n` : ''; - const effectiveChunkSize = chunkSize - prefix.length; - - if (effectiveChunkSize <= MIN_CHUNK_LENGTH) { - return [{ content: prefix + trimmed, index: 0 }]; - } - - // If content fits in one chunk, return as-is - if (trimmed.length <= effectiveChunkSize) { - return [{ content: prefix + trimmed, index: 0 }]; - } - - const paragraphs = splitIntoParagraphs(trimmed); - const rawChunks = mergeIntoChunks( - paragraphs, - effectiveChunkSize, - chunkOverlap, - ); - - return rawChunks - .filter((c) => c.length >= MIN_CHUNK_LENGTH) - .map((c, i) => ({ content: prefix + c, index: i })); -} - -function splitIntoParagraphs(text: string): string[] { - return text - .split(/\n\s*\n/) - .map((p) => p.trim()) - .filter(Boolean); -} - -function mergeIntoChunks( - segments: string[], - maxSize: number, - overlap: number, -): string[] { - const chunks: string[] = []; - let current = ''; - - for (const segment of segments) { - // If a single segment exceeds maxSize, split it by sentences - if (segment.length > maxSize) { - if (current) { - chunks.push(current.trim()); - current = getOverlapText(current, overlap); - } - const sentenceChunks = splitBySentences(segment, maxSize, overlap); - for (const sc of 
sentenceChunks) { - chunks.push((current + sc).trim()); - current = getOverlapText(sc, overlap); - } - continue; - } - - const combined = current ? current + '\n\n' + segment : segment; - if (combined.length <= maxSize) { - current = combined; - } else { - chunks.push(current.trim()); - current = getOverlapText(current, overlap) + segment; - } - } - - if (current.trim()) { - chunks.push(current.trim()); - } - - return chunks; -} - -function splitBySentences( - text: string, - maxSize: number, - overlap: number, -): string[] { - const sentences = text.match(/[^.!?]+[.!?]+\s*/g) || [text]; - const chunks: string[] = []; - let current = ''; - - for (const sentence of sentences) { - const combined = current + sentence; - if (combined.length <= maxSize) { - current = combined; - } else { - if (current) chunks.push(current.trim()); - current = getOverlapText(current, overlap) + sentence; - } - } - - if (current.trim()) { - chunks.push(current.trim()); - } - - return chunks; -} - -function getOverlapText(text: string, overlap: number): string { - if (text.length <= overlap) return text; - const slice = text.slice(-overlap); - // Try to start at a word boundary - const wordBoundary = slice.indexOf(' '); - return wordBoundary > 0 ? 
slice.slice(wordBoundary + 1) : slice; -} diff --git a/services/platform/convex/website_page_embeddings/content_hash.test.ts b/services/platform/convex/website_page_embeddings/content_hash.test.ts deleted file mode 100644 index 1c18d5870b..0000000000 --- a/services/platform/convex/website_page_embeddings/content_hash.test.ts +++ /dev/null @@ -1,38 +0,0 @@ -import { describe, expect, it } from 'vitest'; - -import { computeContentHash } from './content_hash'; - -describe('computeContentHash', () => { - it('returns consistent hash for same input', () => { - const hash1 = computeContentHash('hello world'); - const hash2 = computeContentHash('hello world'); - expect(hash1).toBe(hash2); - }); - - it('returns different hashes for different input', () => { - const hash1 = computeContentHash('hello world'); - const hash2 = computeContentHash('hello world!'); - expect(hash1).not.toBe(hash2); - }); - - it('returns 8-character hex string', () => { - const hash = computeContentHash('test'); - expect(hash).toMatch(/^[0-9a-f]{8}$/); - }); - - it('handles empty string', () => { - const hash = computeContentHash(''); - expect(hash).toMatch(/^[0-9a-f]{8}$/); - }); - - it('handles long content', () => { - const longContent = 'x'.repeat(100000); - const hash = computeContentHash(longContent); - expect(hash).toMatch(/^[0-9a-f]{8}$/); - }); - - it('handles unicode content', () => { - const hash = computeContentHash('你好世界 🌍'); - expect(hash).toMatch(/^[0-9a-f]{8}$/); - }); -}); diff --git a/services/platform/convex/website_page_embeddings/content_hash.ts b/services/platform/convex/website_page_embeddings/content_hash.ts deleted file mode 100644 index b33ec38c47..0000000000 --- a/services/platform/convex/website_page_embeddings/content_hash.ts +++ /dev/null @@ -1,14 +0,0 @@ -/** - * Content Hash - * - * Simple hash for detecting content changes in website pages. - * Uses DJB2 algorithm - fast, deterministic, and sufficient for change detection. 
- */ - -export function computeContentHash(content: string): string { - let hash = 5381; - for (let i = 0; i < content.length; i++) { - hash = ((hash << 5) + hash + content.charCodeAt(i)) | 0; - } - return (hash >>> 0).toString(16).padStart(8, '0'); -} diff --git a/services/platform/convex/website_page_embeddings/embedding_pool.ts b/services/platform/convex/website_page_embeddings/embedding_pool.ts deleted file mode 100644 index 894b136306..0000000000 --- a/services/platform/convex/website_page_embeddings/embedding_pool.ts +++ /dev/null @@ -1,18 +0,0 @@ -import { Workpool } from '@convex-dev/workpool'; - -import { components } from '../_generated/api'; - -// Serialize embedding generation to avoid overwhelming the embedding API -// and Convex action concurrency. During bulk scans, hundreds of pages may -// need embeddings across multiple batches — without a pool, all embedding -// actions would fire concurrently, causing rate-limit failures and server -// instability. -export const embeddingPool = new Workpool(components.embeddingPool, { - maxParallelism: 1, - retryActionsByDefault: true, - defaultRetryBehavior: { - maxAttempts: 3, - initialBackoffMs: 10_000, - base: 2, - }, -}); diff --git a/services/platform/convex/website_page_embeddings/internal_actions.ts b/services/platform/convex/website_page_embeddings/internal_actions.ts deleted file mode 100644 index fa48dcc74a..0000000000 --- a/services/platform/convex/website_page_embeddings/internal_actions.ts +++ /dev/null @@ -1,414 +0,0 @@ -/** - * Internal Actions for Website Page Embeddings - * - * Handles embedding generation (chunking + agent component) and vector search. 
- */ - -import { embedMany } from '@convex-dev/agent'; -import { v } from 'convex/values'; - -import type { Id } from '../_generated/dataModel'; -import type { ActionCtx } from '../_generated/server'; - -import { internal } from '../_generated/api'; -import { internalAction } from '../_generated/server'; -import { createDebugLog } from '../lib/debug_log'; -import { - getEmbeddingDimension, - getTextEmbeddingModel, -} from '../lib/embedding_config'; -import { classifyError } from '../lib/error_classification'; -import { chunkContent } from './chunk_content'; -import { computeContentHash } from './content_hash'; -import { mergeWithRRF } from './rrf'; - -const MAX_SEARCH_LIMIT = 256; - -const debugLog = createDebugLog('DEBUG_EMBEDDINGS', '[WebsitePageEmbeddings]'); - -export const embedPage = internalAction({ - args: { - organizationId: v.string(), - websiteId: v.id('websites'), - pageId: v.id('websitePages'), - }, - handler: async (ctx, args) => { - const { organizationId, websiteId, pageId } = args; - const dimension = getEmbeddingDimension(); - - debugLog('embedPage start', { pageId, dimension }); - - // 1. Load page content - const page = await ctx.runQuery( - internal.website_page_embeddings.internal_queries.getPageById, - { pageId }, - ); - - if (!page || !page.content) { - debugLog('embedPage skip - no content', { pageId }); - return { status: 'skipped', reason: 'no_content' }; - } - - // 2. Check content hash to skip if unchanged - const contentHash = computeContentHash(page.content); - const existing = await ctx.runQuery( - internal.website_page_embeddings.internal_queries - .getExistingEmbeddingHash, - { organizationId, pageId, dimension }, - ); - - if (existing?.contentHash === contentHash) { - debugLog('embedPage skip - content unchanged', { - pageId, - contentHash, - }); - return { status: 'skipped', reason: 'unchanged' }; - } - - // 3. 
Chunk content - const chunks = chunkContent(page.content, page.title); - if (chunks.length === 0) { - debugLog('embedPage skip - no chunks', { pageId }); - return { status: 'skipped', reason: 'no_chunks' }; - } - - debugLog('embedPage chunked', { - pageId, - chunkCount: chunks.length, - }); - - // 4. Generate embeddings via agent component - // Retries are handled by the workpool (embeddingPool) — just throw on failure. - const textEmbeddingModel = getTextEmbeddingModel(); - const { embeddings } = await embedMany(ctx, { - userId: undefined, - threadId: undefined, - values: chunks.map((c) => c.content), - textEmbeddingModel, - }); - - debugLog('embedPage embedded', { - pageId, - embeddingCount: embeddings.length, - dimension, - }); - - // 5. Delete old embeddings for this page - await ctx.runMutation( - internal.website_page_embeddings.internal_mutations.deleteByPageId, - { organizationId, pageId, dimension }, - ); - - // 6. Store new embeddings - const insertMutation = getInsertMutation(dimension); - for (const [i, chunk] of chunks.entries()) { - await ctx.runMutation(insertMutation, { - organizationId, - websiteId, - pageId, - embedding: embeddings[i], - url: page.url, - title: page.title, - contentHash, - chunkIndex: chunk.index, - chunkContent: chunk.content, - }); - } - - debugLog('embedPage complete', { - pageId, - chunksStored: chunks.length, - }); - - return { status: 'success', chunks: chunks.length }; - }, -}); - -interface SearchResult { - url: string; - title?: string; - chunkContent: string; - chunkIndex: number; - score: number; -} - -export const search = internalAction({ - args: { - organizationId: v.string(), - websiteId: v.optional(v.id('websites')), - query: v.string(), - limit: v.optional(v.number()), - }, - handler: async (ctx, args): Promise => { - const { organizationId, query } = args; - const limit = Math.min(Math.max(args.limit ?? 
10, 1), MAX_SEARCH_LIMIT); - const dimension = getEmbeddingDimension(); - - debugLog('search start', { organizationId, query, dimension }); - - // 1. Generate query embedding via agent component (with inline retry) - const textEmbeddingModel = getTextEmbeddingModel(); - let queryEmbedding: number[]; - - try { - const result = await embedMany(ctx, { - userId: undefined, - threadId: undefined, - values: [query], - textEmbeddingModel, - }); - queryEmbedding = result.embeddings[0]; - } catch (error) { - const classification = classifyError(error); - if (!classification.shouldRetry) throw error; - - debugLog('search embedMany failed, retrying inline', { - query, - reason: classification.reason, - }); - await new Promise((resolve) => setTimeout(resolve, 1_000)); - - const retryResult = await embedMany(ctx, { - userId: undefined, - threadId: undefined, - values: [query], - textEmbeddingModel, - }); - queryEmbedding = retryResult.embeddings[0]; - } - - // 2. Run vector search + full-text search in parallel - const searchLimit = args.websiteId - ? Math.min(limit * 3, MAX_SEARCH_LIMIT) - : limit; - - const [vectorResults, fullTextResults] = await Promise.all([ - runVectorSearch( - ctx, - dimension, - queryEmbedding, - organizationId, - searchLimit, - ), - ctx.runQuery( - internal.website_page_embeddings.internal_queries.fullTextSearch, - { - organizationId, - websiteId: args.websiteId, - query, - limit: searchLimit, - dimension, - }, - ), - ]); - - // 3. Fetch full records for vector results - const vectorRecords = await fetchResultRecords( - ctx, - dimension, - vectorResults, - ); - - // 4. Build ranked lists for RRF merge - const vectorRanked = vectorRecords - .map((record, i) => - record - ? 
{ - id: vectorResults[i]._id, - url: record.url, - title: record.title, - chunkContent: record.chunkContent, - chunkIndex: record.chunkIndex, - websiteId: record.websiteId, - } - : null, - ) - .filter((r): r is NonNullable => r != null) - .filter((r) => !args.websiteId || r.websiteId === args.websiteId); - - const ftRanked = fullTextResults.map((r) => ({ - id: r._id, - url: r.url, - title: r.title, - chunkContent: r.chunkContent, - chunkIndex: r.chunkIndex, - websiteId: r.websiteId, - })); - - // 5. Merge with RRF - const merged = mergeWithRRF([vectorRanked, ftRanked], limit); - - debugLog('search complete', { - query, - vectorCount: vectorRanked.length, - ftCount: ftRanked.length, - mergedCount: merged.length, - }); - - return merged.map((r) => ({ - url: r.url, - title: r.title, - chunkContent: r.chunkContent, - chunkIndex: r.chunkIndex, - score: r.score, - })); - }, -}); - -function getInsertMutation(dimension: number) { - switch (dimension) { - case 256: - return internal.website_page_embeddings.internal_mutations - .insertEmbedding256; - case 512: - return internal.website_page_embeddings.internal_mutations - .insertEmbedding512; - case 1024: - return internal.website_page_embeddings.internal_mutations - .insertEmbedding1024; - case 1536: - return internal.website_page_embeddings.internal_mutations - .insertEmbedding1536; - case 2048: - return internal.website_page_embeddings.internal_mutations - .insertEmbedding2048; - case 2560: - return internal.website_page_embeddings.internal_mutations - .insertEmbedding2560; - case 4096: - return internal.website_page_embeddings.internal_mutations - .insertEmbedding4096; - default: - throw new Error(`Unsupported embedding dimension: ${dimension}`); - } -} - -interface VectorSearchResult { - _id: string; - _score: number; -} - -async function runVectorSearch( - ctx: ActionCtx, - dimension: number, - vector: number[], - organizationId: string, - limit: number, -): Promise { - // Convex vector search filters only support eq 
+ or (no and), so we filter - // by organizationId here and post-filter by websiteId in the caller. - switch (dimension) { - case 256: - return ctx.vectorSearch('websitePageEmbeddings256', 'by_embedding', { - vector, - limit, - filter: (q) => q.eq('organizationId', organizationId), - }); - case 512: - return ctx.vectorSearch('websitePageEmbeddings512', 'by_embedding', { - vector, - limit, - filter: (q) => q.eq('organizationId', organizationId), - }); - case 1024: - return ctx.vectorSearch('websitePageEmbeddings1024', 'by_embedding', { - vector, - limit, - filter: (q) => q.eq('organizationId', organizationId), - }); - case 1536: - return ctx.vectorSearch('websitePageEmbeddings1536', 'by_embedding', { - vector, - limit, - filter: (q) => q.eq('organizationId', organizationId), - }); - case 2048: - return ctx.vectorSearch('websitePageEmbeddings2048', 'by_embedding', { - vector, - limit, - filter: (q) => q.eq('organizationId', organizationId), - }); - case 2560: - return ctx.vectorSearch('websitePageEmbeddings2560', 'by_embedding', { - vector, - limit, - filter: (q) => q.eq('organizationId', organizationId), - }); - case 4096: - return ctx.vectorSearch('websitePageEmbeddings4096', 'by_embedding', { - vector, - limit, - filter: (q) => q.eq('organizationId', organizationId), - }); - default: - return []; - } -} - -interface EmbeddingRecord { - url: string; - title?: string; - chunkContent: string; - chunkIndex: number; - websiteId: Id<'websites'>; -} - -async function fetchResultRecords( - ctx: ActionCtx, - dimension: number, - results: VectorSearchResult[], -): Promise> { - const ids = results.map((r) => r._id); - - switch (dimension) { - case 256: - return await ctx.runQuery( - internal.website_page_embeddings.internal_queries.fetchSearchResults256, - // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- vectorSearch returns string IDs that match the table's Id type - { ids: ids as unknown as Id<'websitePageEmbeddings256'>[] }, - ); - case 512: - return 
await ctx.runQuery( - internal.website_page_embeddings.internal_queries.fetchSearchResults512, - // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- vectorSearch returns string IDs that match the table's Id type - { ids: ids as unknown as Id<'websitePageEmbeddings512'>[] }, - ); - case 1024: - return await ctx.runQuery( - internal.website_page_embeddings.internal_queries - .fetchSearchResults1024, - // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- vectorSearch returns string IDs that match the table's Id type - { ids: ids as unknown as Id<'websitePageEmbeddings1024'>[] }, - ); - case 1536: - return await ctx.runQuery( - internal.website_page_embeddings.internal_queries - .fetchSearchResults1536, - // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- vectorSearch returns string IDs that match the table's Id type - { ids: ids as unknown as Id<'websitePageEmbeddings1536'>[] }, - ); - case 2048: - return await ctx.runQuery( - internal.website_page_embeddings.internal_queries - .fetchSearchResults2048, - // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- vectorSearch returns string IDs that match the table's Id type - { ids: ids as unknown as Id<'websitePageEmbeddings2048'>[] }, - ); - case 2560: - return await ctx.runQuery( - internal.website_page_embeddings.internal_queries - .fetchSearchResults2560, - // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- vectorSearch returns string IDs that match the table's Id type - { ids: ids as unknown as Id<'websitePageEmbeddings2560'>[] }, - ); - case 4096: - return await ctx.runQuery( - internal.website_page_embeddings.internal_queries - .fetchSearchResults4096, - // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- vectorSearch returns string IDs that match the table's Id type - { ids: ids as unknown as Id<'websitePageEmbeddings4096'>[] }, - ); - default: - return []; - } -} diff --git 
a/services/platform/convex/website_page_embeddings/internal_mutations.ts b/services/platform/convex/website_page_embeddings/internal_mutations.ts deleted file mode 100644 index 8ee91e3b99..0000000000 --- a/services/platform/convex/website_page_embeddings/internal_mutations.ts +++ /dev/null @@ -1,194 +0,0 @@ -/** - * Internal Mutations for Website Page Embeddings - */ - -import { v } from 'convex/values'; - -import { internalMutation } from '../_generated/server'; - -const embeddingRecordFields = { - organizationId: v.string(), - websiteId: v.id('websites'), - pageId: v.id('websitePages'), - embedding: v.array(v.float64()), - url: v.string(), - title: v.optional(v.string()), - contentHash: v.string(), - chunkIndex: v.number(), - chunkContent: v.string(), -}; - -export const deleteByPageId = internalMutation({ - args: { - organizationId: v.string(), - pageId: v.id('websitePages'), - dimension: v.number(), - }, - handler: async (ctx, args) => { - const { organizationId, pageId, dimension } = args; - - const indexFilter = (q: { eq: Function }) => - q.eq('organizationId', organizationId).eq('pageId', pageId); - - let count = 0; - switch (dimension) { - case 256: { - const q = ctx.db - .query('websitePageEmbeddings256') - .withIndex('by_organizationId_and_pageId', indexFilter); - for await (const r of q) { - await ctx.db.delete(r._id); - count++; - } - break; - } - case 512: { - const q = ctx.db - .query('websitePageEmbeddings512') - .withIndex('by_organizationId_and_pageId', indexFilter); - for await (const r of q) { - await ctx.db.delete(r._id); - count++; - } - break; - } - case 1024: { - const q = ctx.db - .query('websitePageEmbeddings1024') - .withIndex('by_organizationId_and_pageId', indexFilter); - for await (const r of q) { - await ctx.db.delete(r._id); - count++; - } - break; - } - case 1536: { - const q = ctx.db - .query('websitePageEmbeddings1536') - .withIndex('by_organizationId_and_pageId', indexFilter); - for await (const r of q) { - await 
ctx.db.delete(r._id); - count++; - } - break; - } - case 2048: { - const q = ctx.db - .query('websitePageEmbeddings2048') - .withIndex('by_organizationId_and_pageId', indexFilter); - for await (const r of q) { - await ctx.db.delete(r._id); - count++; - } - break; - } - case 2560: { - const q = ctx.db - .query('websitePageEmbeddings2560') - .withIndex('by_organizationId_and_pageId', indexFilter); - for await (const r of q) { - await ctx.db.delete(r._id); - count++; - } - break; - } - case 4096: { - const q = ctx.db - .query('websitePageEmbeddings4096') - .withIndex('by_organizationId_and_pageId', indexFilter); - for await (const r of q) { - await ctx.db.delete(r._id); - count++; - } - break; - } - default: - throw new Error(`Unsupported embedding dimension: ${dimension}`); - } - return count; - }, -}); - -export const insertEmbedding256 = internalMutation({ - args: embeddingRecordFields, - handler: async (ctx, args) => { - const now = Date.now(); - return await ctx.db.insert('websitePageEmbeddings256', { - ...args, - createdAt: now, - updatedAt: now, - }); - }, -}); - -export const insertEmbedding512 = internalMutation({ - args: embeddingRecordFields, - handler: async (ctx, args) => { - const now = Date.now(); - return await ctx.db.insert('websitePageEmbeddings512', { - ...args, - createdAt: now, - updatedAt: now, - }); - }, -}); - -export const insertEmbedding1024 = internalMutation({ - args: embeddingRecordFields, - handler: async (ctx, args) => { - const now = Date.now(); - return await ctx.db.insert('websitePageEmbeddings1024', { - ...args, - createdAt: now, - updatedAt: now, - }); - }, -}); - -export const insertEmbedding1536 = internalMutation({ - args: embeddingRecordFields, - handler: async (ctx, args) => { - const now = Date.now(); - return await ctx.db.insert('websitePageEmbeddings1536', { - ...args, - createdAt: now, - updatedAt: now, - }); - }, -}); - -export const insertEmbedding2048 = internalMutation({ - args: embeddingRecordFields, - handler: async 
(ctx, args) => { - const now = Date.now(); - return await ctx.db.insert('websitePageEmbeddings2048', { - ...args, - createdAt: now, - updatedAt: now, - }); - }, -}); - -export const insertEmbedding2560 = internalMutation({ - args: embeddingRecordFields, - handler: async (ctx, args) => { - const now = Date.now(); - return await ctx.db.insert('websitePageEmbeddings2560', { - ...args, - createdAt: now, - updatedAt: now, - }); - }, -}); - -export const insertEmbedding4096 = internalMutation({ - args: embeddingRecordFields, - handler: async (ctx, args) => { - const now = Date.now(); - return await ctx.db.insert('websitePageEmbeddings4096', { - ...args, - createdAt: now, - updatedAt: now, - }); - }, -}); diff --git a/services/platform/convex/website_page_embeddings/internal_queries.ts b/services/platform/convex/website_page_embeddings/internal_queries.ts deleted file mode 100644 index afa6dd7be7..0000000000 --- a/services/platform/convex/website_page_embeddings/internal_queries.ts +++ /dev/null @@ -1,189 +0,0 @@ -/** - * Internal Queries for Website Page Embeddings - */ - -import { v } from 'convex/values'; - -import { internalQuery } from '../_generated/server'; - -export const getPageById = internalQuery({ - args: { pageId: v.id('websitePages') }, - handler: async (ctx, args) => { - return await ctx.db.get(args.pageId); - }, -}); - -export const getExistingEmbeddingHash = internalQuery({ - args: { - organizationId: v.string(), - pageId: v.id('websitePages'), - dimension: v.number(), - }, - handler: async (ctx, args) => { - const { organizationId, pageId, dimension } = args; - - const indexFilter = (q: { eq: Function }) => - q.eq('organizationId', organizationId).eq('pageId', pageId); - - let record; - switch (dimension) { - case 256: - record = await ctx.db - .query('websitePageEmbeddings256') - .withIndex('by_organizationId_and_pageId', indexFilter) - .first(); - break; - case 512: - record = await ctx.db - .query('websitePageEmbeddings512') - 
.withIndex('by_organizationId_and_pageId', indexFilter) - .first(); - break; - case 1024: - record = await ctx.db - .query('websitePageEmbeddings1024') - .withIndex('by_organizationId_and_pageId', indexFilter) - .first(); - break; - case 1536: - record = await ctx.db - .query('websitePageEmbeddings1536') - .withIndex('by_organizationId_and_pageId', indexFilter) - .first(); - break; - case 2048: - record = await ctx.db - .query('websitePageEmbeddings2048') - .withIndex('by_organizationId_and_pageId', indexFilter) - .first(); - break; - case 2560: - record = await ctx.db - .query('websitePageEmbeddings2560') - .withIndex('by_organizationId_and_pageId', indexFilter) - .first(); - break; - case 4096: - record = await ctx.db - .query('websitePageEmbeddings4096') - .withIndex('by_organizationId_and_pageId', indexFilter) - .first(); - break; - default: - return null; - } - - return record ? { contentHash: record.contentHash } : null; - }, -}); - -export const fullTextSearch = internalQuery({ - args: { - organizationId: v.string(), - websiteId: v.optional(v.id('websites')), - query: v.string(), - limit: v.number(), - dimension: v.number(), - }, - handler: async (ctx, args) => { - const { organizationId, websiteId, query, limit, dimension } = args; - - const searchFilter = websiteId - ? 
(q: { search: Function }) => - q - .search('chunkContent', query) - .eq('organizationId', organizationId) - .eq('websiteId', websiteId) - : (q: { search: Function }) => - q.search('chunkContent', query).eq('organizationId', organizationId); - - switch (dimension) { - case 256: - return await ctx.db - .query('websitePageEmbeddings256') - .withSearchIndex('by_content', searchFilter) - .take(limit); - case 512: - return await ctx.db - .query('websitePageEmbeddings512') - .withSearchIndex('by_content', searchFilter) - .take(limit); - case 1024: - return await ctx.db - .query('websitePageEmbeddings1024') - .withSearchIndex('by_content', searchFilter) - .take(limit); - case 1536: - return await ctx.db - .query('websitePageEmbeddings1536') - .withSearchIndex('by_content', searchFilter) - .take(limit); - case 2048: - return await ctx.db - .query('websitePageEmbeddings2048') - .withSearchIndex('by_content', searchFilter) - .take(limit); - case 2560: - return await ctx.db - .query('websitePageEmbeddings2560') - .withSearchIndex('by_content', searchFilter) - .take(limit); - case 4096: - return await ctx.db - .query('websitePageEmbeddings4096') - .withSearchIndex('by_content', searchFilter) - .take(limit); - default: - return []; - } - }, -}); - -export const fetchSearchResults256 = internalQuery({ - args: { ids: v.array(v.id('websitePageEmbeddings256')) }, - handler: async (ctx, args) => { - return await Promise.all(args.ids.map((id) => ctx.db.get(id))); - }, -}); - -export const fetchSearchResults512 = internalQuery({ - args: { ids: v.array(v.id('websitePageEmbeddings512')) }, - handler: async (ctx, args) => { - return await Promise.all(args.ids.map((id) => ctx.db.get(id))); - }, -}); - -export const fetchSearchResults1024 = internalQuery({ - args: { ids: v.array(v.id('websitePageEmbeddings1024')) }, - handler: async (ctx, args) => { - return await Promise.all(args.ids.map((id) => ctx.db.get(id))); - }, -}); - -export const fetchSearchResults1536 = internalQuery({ - args: { 
ids: v.array(v.id('websitePageEmbeddings1536')) }, - handler: async (ctx, args) => { - return await Promise.all(args.ids.map((id) => ctx.db.get(id))); - }, -}); - -export const fetchSearchResults2048 = internalQuery({ - args: { ids: v.array(v.id('websitePageEmbeddings2048')) }, - handler: async (ctx, args) => { - return await Promise.all(args.ids.map((id) => ctx.db.get(id))); - }, -}); - -export const fetchSearchResults2560 = internalQuery({ - args: { ids: v.array(v.id('websitePageEmbeddings2560')) }, - handler: async (ctx, args) => { - return await Promise.all(args.ids.map((id) => ctx.db.get(id))); - }, -}); - -export const fetchSearchResults4096 = internalQuery({ - args: { ids: v.array(v.id('websitePageEmbeddings4096')) }, - handler: async (ctx, args) => { - return await Promise.all(args.ids.map((id) => ctx.db.get(id))); - }, -}); diff --git a/services/platform/convex/website_page_embeddings/rrf.test.ts b/services/platform/convex/website_page_embeddings/rrf.test.ts deleted file mode 100644 index 01e449e5c6..0000000000 --- a/services/platform/convex/website_page_embeddings/rrf.test.ts +++ /dev/null @@ -1,69 +0,0 @@ -import { describe, expect, it } from 'vitest'; - -import { mergeWithRRF } from './rrf'; - -describe('mergeWithRRF', () => { - it('merges two ranked lists and boosts items appearing in both', () => { - const vectorResults = [ - { id: 'a', content: 'alpha' }, - { id: 'b', content: 'beta' }, - { id: 'c', content: 'gamma' }, - ]; - const textResults = [ - { id: 'b', content: 'beta' }, - { id: 'd', content: 'delta' }, - { id: 'a', content: 'alpha' }, - ]; - - const merged = mergeWithRRF([vectorResults, textResults], 10); - - // 'b' appears in both lists (rank 1 in vector, rank 0 in text) → highest combined score - // 'a' appears in both lists (rank 0 in vector, rank 2 in text) → second highest - expect(merged[0].id).toBe('b'); - expect(merged[1].id).toBe('a'); - expect(merged.length).toBe(4); - }); - - it('respects the limit parameter', () => { - const 
list1 = [{ id: 'a' }, { id: 'b' }, { id: 'c' }]; - const list2 = [{ id: 'd' }, { id: 'e' }]; - - const merged = mergeWithRRF([list1, list2], 3); - - expect(merged.length).toBe(3); - }); - - it('handles empty lists', () => { - const merged = mergeWithRRF([[], []], 10); - expect(merged).toEqual([]); - }); - - it('handles single list', () => { - const list = [{ id: 'a' }, { id: 'b' }]; - const merged = mergeWithRRF([list], 10); - - expect(merged.length).toBe(2); - expect(merged[0].id).toBe('a'); - expect(merged[1].id).toBe('b'); - }); - - it('preserves item properties from first occurrence', () => { - const list1 = [{ id: 'a', source: 'vector' }]; - const list2 = [{ id: 'a', source: 'text' }]; - - const merged = mergeWithRRF([list1, list2], 10); - - expect(merged[0].id).toBe('a'); - expect(merged[0].source).toBe('vector'); - }); - - it('assigns RRF scores to results', () => { - const list = [{ id: 'a' }, { id: 'b' }]; - const merged = mergeWithRRF([list], 10); - - // rank 0: 1/(60+1) ≈ 0.01639 - // rank 1: 1/(60+2) ≈ 0.01613 - expect(merged[0].score).toBeCloseTo(1 / 61, 5); - expect(merged[1].score).toBeCloseTo(1 / 62, 5); - }); -}); diff --git a/services/platform/convex/website_page_embeddings/rrf.ts b/services/platform/convex/website_page_embeddings/rrf.ts deleted file mode 100644 index aa490971a9..0000000000 --- a/services/platform/convex/website_page_embeddings/rrf.ts +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Reciprocal Rank Fusion (RRF) - * - * Merges ranked results from multiple search methods (vector + full-text) - * into a single ranked list. Standard constant k=60. - */ - -const RRF_K = 60; - -interface RankedItem { - id: string; -} - -export function mergeWithRRF( - rankedLists: T[][], - limit: number, -): Array { - const scores = new Map(); - const items = new Map(); - - for (const list of rankedLists) { - for (const [rank, item] of list.entries()) { - const rrfScore = 1 / (RRF_K + rank + 1); - scores.set(item.id, (scores.get(item.id) ?? 
0) + rrfScore); - if (!items.has(item.id)) { - items.set(item.id, item); - } - } - } - - const results: Array = []; - const sorted = [...scores.entries()].sort((a, b) => b[1] - a[1]); - for (const [id, score] of sorted.slice(0, limit)) { - const item = items.get(id); - if (item) { - results.push({ ...item, score }); - } - } - return results; -} diff --git a/services/platform/convex/website_page_embeddings/schema.ts b/services/platform/convex/website_page_embeddings/schema.ts deleted file mode 100644 index d6b5805084..0000000000 --- a/services/platform/convex/website_page_embeddings/schema.ts +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Website Page Embeddings Schema - * - * Pre-defined multi-dimension vector tables for semantic search. - * Users select which dimension to use via the EMBEDDING_DIMENSIONS env var. - * Convex vectorIndex supports dimensions between 2 and 4096. - */ - -import { defineTable } from 'convex/server'; -import { v } from 'convex/values'; - -function defineEmbeddingTable(dimensions: number) { - return defineTable({ - organizationId: v.string(), - websiteId: v.id('websites'), - pageId: v.id('websitePages'), - embedding: v.array(v.float64()), - url: v.string(), - title: v.optional(v.string()), - contentHash: v.string(), - chunkIndex: v.number(), - chunkContent: v.string(), - createdAt: v.number(), - updatedAt: v.number(), - }) - .index('by_organizationId', ['organizationId']) - .index('by_pageId', ['pageId']) - .index('by_organizationId_and_pageId', ['organizationId', 'pageId']) - .searchIndex('by_content', { - searchField: 'chunkContent', - filterFields: ['organizationId', 'websiteId'], - }) - .vectorIndex('by_embedding', { - vectorField: 'embedding', - dimensions, - filterFields: ['organizationId', 'websiteId'], - }); -} - -export const websitePageEmbeddings256Table = defineEmbeddingTable(256); -export const websitePageEmbeddings512Table = defineEmbeddingTable(512); -export const websitePageEmbeddings1024Table = defineEmbeddingTable(1024); -export 
const websitePageEmbeddings1536Table = defineEmbeddingTable(1536);
-export const websitePageEmbeddings2048Table = defineEmbeddingTable(2048);
-export const websitePageEmbeddings2560Table = defineEmbeddingTable(2560);
-export const websitePageEmbeddings4096Table = defineEmbeddingTable(4096);
diff --git a/services/platform/convex/websites/actions.ts b/services/platform/convex/websites/actions.ts
new file mode 100644
index 0000000000..edeeeb5233
--- /dev/null
+++ b/services/platform/convex/websites/actions.ts
@@ -0,0 +1,256 @@
+/**
+ * Public actions for managing websites.
+ *
+ * Every action requires an authenticated caller who passes the
+ * organization membership check for the website's organization.
+ */
+
+import { v } from 'convex/values';
+
+import type { Id } from '../_generated/dataModel';
+import type { FetchPagesResult } from './types';
+
+import { internal } from '../_generated/api';
+import { action } from '../_generated/server';
+import { authComponent } from '../auth';
+import {
+  registerDomainWithCrawler,
+  deregisterDomainFromCrawler,
+  fetchWebsiteInfo,
+} from './internal_actions';
+
+/**
+ * Create a website record, then register its domain with the crawler.
+ * If registration fails the record is patched to status 'error' and the
+ * error is rethrown; otherwise the page count reported by the crawler is
+ * stored when available.
+ */
+export const createWebsite = action({
+  args: {
+    organizationId: v.string(),
+    domain: v.string(),
+    title: v.optional(v.string()),
+    description: v.optional(v.string()),
+    scanInterval: v.string(),
+  },
+  returns: v.id('websites'),
+  handler: async (ctx, args): Promise<Id<'websites'>> => {
+    const authUser = await authComponent.getAuthUser(ctx);
+    if (!authUser) throw new Error('Unauthenticated');
+
+    await ctx.runQuery(
+      internal.websites.internal_queries.verifyOrganizationMembership,
+      {
+        organizationId: args.organizationId,
+        userId: authUser._id,
+        email: authUser.email,
+        name: authUser.name,
+      },
+    );
+
+    const websiteId = await ctx.runMutation(
+      internal.websites.internal_mutations.provisionWebsite,
+      {
+        organizationId: args.organizationId,
+        domain: args.domain,
+        title: args.title,
+        description: args.description,
+        scanInterval: args.scanInterval,
+      },
+    );
+
+    // Register with crawler — wait for confirmation
+    try {
+      await registerDomainWithCrawler(args.domain, args.scanInterval);
+    } catch (e) {
+      await ctx.runMutation(internal.websites.internal_mutations.patchWebsite, {
+        websiteId,
+        status: 'error',
+      });
+      throw e;
+    }
+
+    // Sync page count
+    const info = await fetchWebsiteInfo(args.domain);
+    if (info?.page_count !== undefined) {
+      await ctx.runMutation(internal.websites.internal_mutations.patchWebsite, {
+        websiteId,
+        pageCount: info.page_count,
+      });
+    }
+
+    return websiteId;
+  },
+});
+
+/**
+ * Delete a website record, then deregister its domain from the crawler.
+ */
+export const deleteWebsite = action({
+  args: {
+    websiteId: v.id('websites'),
+  },
+  returns: v.null(),
+  handler: async (ctx, args): Promise<null> => {
+    const authUser = await authComponent.getAuthUser(ctx);
+    if (!authUser) throw new Error('Unauthenticated');
+
+    const website = await ctx.runQuery(
+      internal.websites.internal_queries.getWebsite,
+      { websiteId: args.websiteId },
+    );
+    if (!website) throw new Error('Website not found');
+
+    await ctx.runQuery(
+      internal.websites.internal_queries.verifyOrganizationMembership,
+      {
+        organizationId: website.organizationId,
+        userId: authUser._id,
+        email: authUser.email,
+        name: authUser.name,
+      },
+    );
+
+    const domain = await ctx.runMutation(
+      internal.websites.internal_mutations.deleteWebsite,
+      { websiteId: args.websiteId },
+    );
+
+    // Deregister from crawler
+    await deregisterDomainFromCrawler(domain);
+
+    return null;
+  },
+});
+
+/**
+ * Re-register a website's domain with the crawler to trigger a rescan.
+ * On failure the record is patched to status 'error' and the error is
+ * rethrown; otherwise the crawler's page count is stored when available.
+ */
+export const rescanWebsite = action({
+  args: {
+    websiteId: v.id('websites'),
+  },
+  returns: v.null(),
+  handler: async (ctx, args): Promise<null> => {
+    const authUser = await authComponent.getAuthUser(ctx);
+    if (!authUser) throw new Error('Unauthenticated');
+
+    const website = await ctx.runQuery(
+      internal.websites.internal_queries.getWebsite,
+      { websiteId: args.websiteId },
+    );
+    if (!website) throw new Error('Website not found');
+
+    await ctx.runQuery(
+      internal.websites.internal_queries.verifyOrganizationMembership,
+      {
+        organizationId: website.organizationId,
+        userId: authUser._id,
+        email: authUser.email,
+        name: authUser.name,
+      },
+    );
+
+    const { domain, scanInterval } = await ctx.runMutation(
+      internal.websites.internal_mutations.rescanWebsite,
+      { websiteId: args.websiteId },
+    );
+
+    // Trigger crawler rescan — re-registering triggers scan
+    try {
+      await registerDomainWithCrawler(domain, scanInterval);
+    } catch (e) {
+      await ctx.runMutation(internal.websites.internal_mutations.patchWebsite, {
+        websiteId: args.websiteId,
+        status: 'error',
+      });
+      throw e;
+    }
+
+    // Sync page count
+    const info = await fetchWebsiteInfo(domain);
+    if (info?.page_count !== undefined) {
+      await ctx.runMutation(internal.websites.internal_mutations.patchWebsite, {
+        websiteId: args.websiteId,
+        pageCount: info.page_count,
+      });
+    }
+
+    return null;
+  },
+});
+
+/**
+ * Run the internal website status sync for an organization.
+ */
+export const syncStatuses = action({
+  args: {
+    organizationId: v.string(),
+  },
+  returns: v.null(),
+  handler: async (ctx, args): Promise<null> => {
+    const authUser = await authComponent.getAuthUser(ctx);
+    if (!authUser) throw new Error('Unauthenticated');
+
+    await ctx.runQuery(
+      internal.websites.internal_queries.verifyOrganizationMembership,
+      {
+        organizationId: args.organizationId,
+        userId: authUser._id,
+        email: authUser.email,
+        name: authUser.name,
+      },
+    );
+
+    await ctx.runAction(
+      internal.websites.internal_actions.syncWebsiteStatuses,
+      { organizationId: args.organizationId },
+    );
+
+    return null;
+  },
+});
+
+/**
+ * Fetch a window of pages for a website via the internal
+ * fetchWebsitePages action; `offset` and `limit` are passed through.
+ */
+export const fetchPages = action({
+  args: {
+    websiteId: v.id('websites'),
+    offset: v.optional(v.number()),
+    limit: v.optional(v.number()),
+  },
+  handler: async (ctx, args): Promise<FetchPagesResult> => {
+    const authUser = await authComponent.getAuthUser(ctx);
+    if (!authUser) throw new Error('Unauthenticated');
+
+    const website = await ctx.runQuery(
+      internal.websites.internal_queries.getWebsite,
+      { websiteId: args.websiteId },
+    );
+    if (!website) throw new Error('Website not found');
+
+    // Authorization: only members of the owning organization may read pages.
+    await ctx.runQuery(
+      internal.websites.internal_queries.verifyOrganizationMembership,
+      {
+        organizationId: website.organizationId,
+        userId: authUser._id,
+        email: authUser.email,
+        name: authUser.name,
+      },
+    );
+
+    return await ctx.runAction(
+      internal.websites.internal_actions.fetchWebsitePages,
+      {
+        domain: website.domain,
+        offset: args.offset,
+        limit: args.limit,
+      },
+    );
+  },
+});
diff --git
a/services/platform/convex/websites/bulk_create_websites.ts b/services/platform/convex/websites/bulk_create_websites.ts index 7d2f92d1e8..92dbc2aefa 100644 --- a/services/platform/convex/websites/bulk_create_websites.ts +++ b/services/platform/convex/websites/bulk_create_websites.ts @@ -5,6 +5,8 @@ import type { MutationCtx } from '../_generated/server'; import type { BulkCreateWebsitesResult, BulkWebsiteData } from './types'; +import { ensureUrl } from './create_website'; + export interface BulkCreateWebsitesArgs { organizationId: string; websites: BulkWebsiteData[]; @@ -27,10 +29,6 @@ export async function bulkCreateWebsites( const websiteData = args.websites[i]; try { - const ensureUrl = (s: string) => - s.startsWith('http://') || s.startsWith('https://') - ? s - : `https://${s}`; const normalized = new URL(ensureUrl(websiteData.domain)).hostname; // Check for duplicates (normalized) diff --git a/services/platform/convex/websites/bulk_upsert_pages.ts b/services/platform/convex/websites/bulk_upsert_pages.ts deleted file mode 100644 index c2cbf36472..0000000000 --- a/services/platform/convex/websites/bulk_upsert_pages.ts +++ /dev/null @@ -1,162 +0,0 @@ -/** - * Bulk upsert website pages (create or update) - */ - -import type { Id } from '../_generated/dataModel'; -import type { MutationCtx } from '../_generated/server'; -import type { BulkUpsertPagesArgs, BulkUpsertPagesResult } from './types'; - -import { internal } from '../_generated/api'; -import { toId } from '../lib/type_cast_helpers'; -import { embeddingPool } from '../website_page_embeddings/embedding_pool'; - -export type { BulkUpsertPagesArgs, BulkUpsertPagesResult }; - -/** - * Bulk upsert website pages - * - Creates new pages if they don't exist - * - Updates existing pages if they do exist (based on URL) - */ -export async function bulkUpsertPages( - ctx: MutationCtx, - args: BulkUpsertPagesArgs, -): Promise { - const now = Date.now(); - - // Batch query all existing pages in parallel - const 
existingPages = await Promise.all( - args.pages.map((page) => - ctx.db - .query('websitePages') - .withIndex('by_organizationId_and_url', (q) => - q.eq('organizationId', args.organizationId).eq('url', page.url), - ) - .first(), - ), - ); - - // Build a map of URL -> existing page (ID + content for change detection) - const existingPageMap = new Map< - string, - { id: Id<'websitePages'>; content?: string } - >(); - for (let i = 0; i < args.pages.length; i++) { - const existing = existingPages[i]; - if (existing) { - existingPageMap.set(args.pages[i].url, { - id: existing._id, - content: existing.content, - }); - } - } - - // Separate pages into updates and inserts - const updates: Array<{ - id: Id<'websitePages'>; - page: (typeof args.pages)[0]; - contentChanged: boolean; - }> = []; - const inserts: Array<(typeof args.pages)[0]> = []; - - for (const page of args.pages) { - const existing = existingPageMap.get(page.url); - if (existing) { - updates.push({ - id: existing.id, - page, - contentChanged: page.content !== existing.content, - }); - } else { - inserts.push(page); - } - } - - const websiteId = toId<'websites'>(args.websiteId); - - // Execute updates in parallel — only patch fields that are explicitly provided - await Promise.all( - updates.map(({ id, page }) => { - const patch: Record = { - lastCrawledAt: now, - syncStatus: 'synced' as const, - }; - if (page.title !== undefined) patch.title = page.title; - if (page.content !== undefined) patch.content = page.content; - if (page.wordCount !== undefined) patch.wordCount = page.wordCount; - if (page.contentHash !== undefined) patch.contentHash = page.contentHash; - if (page.metadata !== undefined) patch.metadata = page.metadata; - if (page.structuredData !== undefined) - patch.structuredData = page.structuredData; - return ctx.db.patch(id, patch); - }), - ); - - // Execute inserts in parallel and collect new IDs - const insertedIds = await Promise.all( - inserts.map((page) => - ctx.db.insert('websitePages', { - 
organizationId: args.organizationId, - websiteId, - url: page.url, - title: page.title, - content: page.content, - wordCount: page.wordCount, - contentHash: page.contentHash, - lastCrawledAt: now, - metadata: page.metadata, - structuredData: page.structuredData, - syncStatus: 'synced' as const, - }), - ), - ); - - // Update page count on the website when new pages are inserted - if (inserts.length > 0) { - const website = await ctx.db.get(websiteId); - if (website) { - await ctx.db.patch(websiteId, { - pageCount: (website.pageCount ?? 0) + inserts.length, - }); - } - } - - // Schedule embedding generation for pages with content - const embeddingsEnabled = !!process.env.EMBEDDING_DIMENSIONS; - if (embeddingsEnabled) { - const pageIdsToEmbed: Id<'websitePages'>[] = []; - - // Updated pages with changed content only - for (const { id, page, contentChanged } of updates) { - if (page.content && contentChanged) pageIdsToEmbed.push(id); - } - - // Newly inserted pages with content - for (let i = 0; i < inserts.length; i++) { - if (inserts[i].content && insertedIds[i]) { - pageIdsToEmbed.push(insertedIds[i]); - } - } - - // Enqueue into a workpool (maxParallelism=1) so embedding jobs run - // one at a time with automatic retries. Without the pool, bulk scans - // would fire hundreds of concurrent embedding API calls across - // multiple batches, causing rate-limit failures and server instability. 
- for (const pageId of pageIdsToEmbed) { - await embeddingPool.enqueueAction( - ctx, - internal.website_page_embeddings.internal_actions.embedPage, - { - organizationId: args.organizationId, - websiteId, - pageId, - }, - ); - } - } - - return { - created: inserts.length, - updated: updates.length, - total: args.pages.length, - }; -} diff --git a/services/platform/convex/websites/cleanup_website.test.ts b/services/platform/convex/websites/cleanup_website.test.ts deleted file mode 100644 index ca595e442d..0000000000 --- a/services/platform/convex/websites/cleanup_website.test.ts +++ /dev/null @@ -1,138 +0,0 @@ -import { describe, expect, it, vi } from 'vitest'; - -import type { MutationCtx } from '../_generated/server'; - -import { cleanupWebsitePagesBatch } from './cleanup_website'; - -type MockId = string; - -function createMockPage(id: string) { - return { _id: id, websiteId: 'website_1' }; -} - -function createMockEmbedding(id: string) { - return { _id: id }; -} - -function asyncIterable(items: T[]) { - return { - [Symbol.asyncIterator]() { - let i = 0; - return { - async next() { - if (i < items.length) { - return { value: items[i++], done: false }; - } - return { value: undefined, done: true }; - }, - }; - }, - }; -} - -function createMockCtx( - pages: Array<{ _id: string }>, - embeddingsByPage: Record>, -) { - const deletedIds: string[] = []; - - const queryMock = vi.fn().mockImplementation((table: string) => { - if (table === 'websitePages') { - return { - withIndex: vi.fn().mockReturnValue(asyncIterable(pages)), - }; - } - return { - withIndex: vi - .fn() - .mockImplementation((_indexName: string, fn: Function) => { - const filter = { eq: vi.fn().mockReturnValue({ pageId: '' }) }; - fn(filter); - const pageId = filter.eq.mock.calls[0]?.[1]; - const embeddings = embeddingsByPage[pageId] ?? 
[]; - return asyncIterable(embeddings); - }), - }; - }); - - const ctx = { - db: { - query: queryMock, - delete: vi.fn().mockImplementation(async (id: string) => { - deletedIds.push(id); - }), - }, - }; - - return { ctx, deletedIds }; -} - -describe('cleanupWebsitePagesBatch', () => { - it('returns hasMore: false when no pages exist', async () => { - const { ctx } = createMockCtx([], {}); - - const result = await cleanupWebsitePagesBatch( - ctx as unknown as MutationCtx, - 'website_1' as MockId as never, - ); - - expect(result).toEqual({ hasMore: false }); - }); - - it('deletes page and its embeddings', async () => { - const page = createMockPage('page_1'); - const embeddings = [ - createMockEmbedding('emb_1'), - createMockEmbedding('emb_2'), - ]; - - const { ctx, deletedIds } = createMockCtx([page], { page_1: embeddings }); - - const result = await cleanupWebsitePagesBatch( - ctx as unknown as MutationCtx, - 'website_1' as MockId as never, - ); - - expect(result).toEqual({ hasMore: false }); - expect(deletedIds).toContain('emb_1'); - expect(deletedIds).toContain('emb_2'); - expect(deletedIds).toContain('page_1'); - }); - - it('returns hasMore: true when batch size exceeded', async () => { - const pages = Array.from({ length: 3 }, (_, i) => - createMockPage(`page_${i}`), - ); - - const { ctx } = createMockCtx(pages, {}); - - const result = await cleanupWebsitePagesBatch( - ctx as unknown as MutationCtx, - 'website_1' as MockId as never, - ); - - expect(result).toEqual({ hasMore: true }); - expect(ctx.db.delete).toHaveBeenCalledTimes(2); - }); - - it('queries all 7 embedding tables per page', async () => { - const page = createMockPage('page_1'); - const { ctx } = createMockCtx([page], {}); - - await cleanupWebsitePagesBatch( - ctx as unknown as MutationCtx, - 'website_1' as MockId as never, - ); - - const queriedTables = ctx.db.query.mock.calls.map( - (call: string[]) => call[0], - ); - expect(queriedTables).toContain('websitePageEmbeddings256'); - 
expect(queriedTables).toContain('websitePageEmbeddings512'); - expect(queriedTables).toContain('websitePageEmbeddings1024'); - expect(queriedTables).toContain('websitePageEmbeddings1536'); - expect(queriedTables).toContain('websitePageEmbeddings2048'); - expect(queriedTables).toContain('websitePageEmbeddings2560'); - expect(queriedTables).toContain('websitePageEmbeddings4096'); - }); -}); diff --git a/services/platform/convex/websites/cleanup_website.ts b/services/platform/convex/websites/cleanup_website.ts deleted file mode 100644 index ef307485a1..0000000000 --- a/services/platform/convex/websites/cleanup_website.ts +++ /dev/null @@ -1,50 +0,0 @@ -/** - * Batch cleanup of website pages and their embeddings. - * - * Queries pages by websiteId, deletes their embeddings from all 7 dimension - * tables, then deletes the page records. Returns { hasMore } so the caller - * can self-reschedule for the next batch. - */ - -import type { Id } from '../_generated/dataModel'; -import type { MutationCtx } from '../_generated/server'; - -const PAGE_BATCH_SIZE = 2; - -const EMBEDDING_TABLES = [ - 'websitePageEmbeddings256', - 'websitePageEmbeddings512', - 'websitePageEmbeddings1024', - 'websitePageEmbeddings1536', - 'websitePageEmbeddings2048', - 'websitePageEmbeddings2560', - 'websitePageEmbeddings4096', -] as const; - -export async function cleanupWebsitePagesBatch( - ctx: MutationCtx, - websiteId: Id<'websites'>, -): Promise<{ hasMore: boolean }> { - let processedCount = 0; - - for await (const page of ctx.db - .query('websitePages') - .withIndex('by_websiteId', (q) => q.eq('websiteId', websiteId))) { - for (const table of EMBEDDING_TABLES) { - for await (const embedding of ctx.db - .query(table) - .withIndex('by_pageId', (q) => q.eq('pageId', page._id))) { - await ctx.db.delete(embedding._id); - } - } - - await ctx.db.delete(page._id); - processedCount++; - - if (processedCount >= PAGE_BATCH_SIZE) { - return { hasMore: true }; - } - } - - return { hasMore: false }; -} diff 
--git a/services/platform/convex/websites/create_website.ts b/services/platform/convex/websites/create_website.ts index 999eae1ba8..b1d5e4fc82 100644 --- a/services/platform/convex/websites/create_website.ts +++ b/services/platform/convex/websites/create_website.ts @@ -1,76 +1,35 @@ /** - * Create a new website and auto-attach a scheduled Website Scan workflow + * Create a new website record in the database. + * Does NOT register with the crawler — that's handled by the calling action. */ -import type { ConvexJsonRecord } from '../../lib/shared/schemas/utils/json-value'; import type { Id } from '../_generated/dataModel'; import type { MutationCtx } from '../_generated/server'; -import { internal } from '../_generated/api'; - export interface CreateWebsiteArgs { organizationId: string; domain: string; // Accepts full URL (preferred) or bare domain title?: string; description?: string; scanInterval: string; // e.g., '60m' | '6h' | '12h' | '1d' | '5d' | '7d' | '30d' - status?: 'active' | 'inactive' | 'error'; - metadata?: ConvexJsonRecord; + status?: 'idle' | 'scanning' | 'active' | 'error'; } -function toUrlAndDomain(input: string): { - websiteUrl: string; - websiteDomain: string; -} { - const ensureUrl = (s: string) => - s.startsWith('http://') || s.startsWith('https://') ? s : `https://${s}`; - try { - const u = new URL(ensureUrl(input)); - const url = `${u.protocol}//${u.host}${u.pathname || ''}`; - return { websiteUrl: url, websiteDomain: u.hostname }; - } catch { - // Fallback: treat as bare domain - const u = new URL(ensureUrl(input)); - return { websiteUrl: u.toString(), websiteDomain: u.hostname }; - } +export function ensureUrl(s: string) { + return s.startsWith('http://') || s.startsWith('https://') + ? 
s + : `https://${s}`; } -function _scanIntervalToCron(interval: string): { - schedule: string; - timezone: string; -} { - // Default timezone to UTC; can be made configurable later - const timezone = 'UTC'; - switch (interval) { - case '60m': - return { schedule: '0 * * * *', timezone }; - case '6h': - return { schedule: '0 */6 * * *', timezone }; - case '12h': - return { schedule: '0 */12 * * *', timezone }; - case '1d': - return { schedule: '0 2 * * *', timezone }; // Daily at 02:00 UTC - case '5d': - return { schedule: '0 2 */5 * *', timezone }; // Every 5 days at 02:00 UTC - case '7d': - return { schedule: '0 2 */7 * *', timezone }; // Every 7 days at 02:00 UTC - case '30d': - return { schedule: '0 2 1 * *', timezone }; // Monthly on the 1st at 02:00 UTC - default: - // Fallback to daily at 02:00 UTC - return { schedule: '0 2 * * *', timezone }; - } +export function toWebsiteDomain(input: string): string { + return new URL(ensureUrl(input)).hostname; } -/** - * Create a new website and automatically create + publish its scan workflow - */ export async function createWebsite( ctx: MutationCtx, args: CreateWebsiteArgs, ): Promise> { - // Normalize domain to bare hostname to avoid duplicates like "https://domain" vs "domain" - const { websiteDomain } = toUrlAndDomain(args.domain); + const websiteDomain = toWebsiteDomain(args.domain); // Prevent duplicates by organization + normalized domain const existingWebsite = await ctx.db @@ -84,24 +43,8 @@ export async function createWebsite( throw new Error(`Website with domain ${websiteDomain} already exists`); } - // Create website with normalized domain stored - const websiteId = await ctx.db.insert('websites', { + return await ctx.db.insert('websites', { ...args, domain: websiteDomain, }); - - // Provision the workflow asynchronously to avoid coupling errors here - await ctx.scheduler.runAfter( - 0, - internal.websites.internal_actions.provisionWebsiteScanWorkflow, - { - organizationId: args.organizationId, - websiteId, 
- domain: websiteDomain, - scanInterval: args.scanInterval, - autoTriggerInitialScan: true, - }, - ); - - return websiteId; } diff --git a/services/platform/convex/websites/delete_website.ts b/services/platform/convex/websites/delete_website.ts index 1871d45fb9..0b1b1e494b 100644 --- a/services/platform/convex/websites/delete_website.ts +++ b/services/platform/convex/websites/delete_website.ts @@ -1,57 +1,21 @@ /** - * Delete a website and cascade-cleanup all related resources. - * - * Synchronous (immediate): - * - Delete scan workflow (via shared deleteWorkflow helper, - * which also cleans triggers, executions, audit logs, etc.) - * - Delete the website record - * - * Asynchronous (scheduled): - * - Batch-delete all website pages and their embeddings + * Delete a website record from the database. + * Does NOT deregister from crawler — that's handled by the calling action. */ import type { Id } from '../_generated/dataModel'; import type { MutationCtx } from '../_generated/server'; -import { isRecord, getString } from '../../lib/utils/type-guards'; -import { internal } from '../_generated/api'; -import { toId } from '../lib/type_cast_helpers'; -import { deleteWorkflow } from '../workflows/definitions/delete_workflow'; - export async function deleteWebsite( ctx: MutationCtx, websiteId: Id<'websites'>, -): Promise { +): Promise { const website = await ctx.db.get(websiteId); if (!website) { throw new Error('Website not found'); } - const metadata = isRecord(website.metadata) ? website.metadata : undefined; - - const scanWorkflowId = metadata - ? 
getString(metadata, 'workflowId') - : undefined; - - if (scanWorkflowId) { - await deleteWorkflow(ctx, toId<'wfDefinitions'>(scanWorkflowId)); - } - - const domain = website.domain; - + const { domain } = website; await ctx.db.delete(websiteId); - - await ctx.scheduler.runAfter( - 0, - internal.websites.internal_mutations.batchCleanupWebsitePages, - { websiteId }, - ); - - await ctx.scheduler.runAfter( - 0, - internal.websites.internal_actions.deregisterWebsiteFromCrawler, - { domain }, - ); - - return null; + return domain; } diff --git a/services/platform/convex/websites/get_page_by_url.ts b/services/platform/convex/websites/get_page_by_url.ts deleted file mode 100644 index fa20518b7f..0000000000 --- a/services/platform/convex/websites/get_page_by_url.ts +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Get a single website page by URL within an organization - */ - -import type { Doc } from '../_generated/dataModel'; -import type { QueryCtx } from '../_generated/server'; - -export interface GetPageByUrlArgs { - organizationId: string; - url: string; -} - -/** - * Get a single website page by organizationId and URL. - * - * Uses the by_organizationId_and_url index on websitePages. - */ -export async function getPageByUrl( - ctx: QueryCtx, - args: GetPageByUrlArgs, -): Promise | null> { - const page = await ctx.db - .query('websitePages') - .withIndex('by_organizationId_and_url', (q) => - q.eq('organizationId', args.organizationId).eq('url', args.url), - ) - .unique(); - - return page ?? 
null; -} diff --git a/services/platform/convex/websites/get_pages_by_website.ts b/services/platform/convex/websites/get_pages_by_website.ts deleted file mode 100644 index 98933a9188..0000000000 --- a/services/platform/convex/websites/get_pages_by_website.ts +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Get all pages for a website - */ - -import type { Doc, Id } from '../_generated/dataModel'; -import type { QueryCtx } from '../_generated/server'; - -export interface GetPagesByWebsiteArgs { - websiteId: Id<'websites'>; - limit?: number; -} - -/** - * Get all pages for a website, ordered by last crawled time (newest first) - */ -export async function getPagesByWebsite( - ctx: QueryCtx, - args: GetPagesByWebsiteArgs, -): Promise>> { - const query = ctx.db - .query('websitePages') - .withIndex('by_websiteId_and_lastCrawledAt', (q) => - q.eq('websiteId', args.websiteId), - ) - .order('desc'); - - if (args.limit) { - return await query.take(args.limit); - } - - const pages: Array> = []; - for await (const page of query) { - pages.push(page); - } - return pages; -} diff --git a/services/platform/convex/websites/get_website_by_domain.ts b/services/platform/convex/websites/get_website_by_domain.ts index c4f261561e..54fbd0447d 100644 --- a/services/platform/convex/websites/get_website_by_domain.ts +++ b/services/platform/convex/websites/get_website_by_domain.ts @@ -5,6 +5,8 @@ import type { Doc } from '../_generated/dataModel'; import type { QueryCtx } from '../_generated/server'; +import { ensureUrl } from './create_website'; + export interface GetWebsiteByDomainArgs { organizationId: string; domain: string; @@ -17,8 +19,6 @@ export async function getWebsiteByDomain( ctx: QueryCtx, args: GetWebsiteByDomainArgs, ): Promise | null> { - const ensureUrl = (s: string) => - s.startsWith('http://') || s.startsWith('https://') ? 
s : `https://${s}`; const host = new URL(ensureUrl(args.domain)).hostname; // Try normalized hostname first diff --git a/services/platform/convex/websites/helpers.ts b/services/platform/convex/websites/helpers.ts index 5df39a8030..fbd3370b89 100644 --- a/services/platform/convex/websites/helpers.ts +++ b/services/platform/convex/websites/helpers.ts @@ -13,9 +13,3 @@ export * from './update_website'; export * from './delete_website'; export * from './rescan_website'; export * from './bulk_create_websites'; -export * from './provision_website_scan_workflow'; -export * from './bulk_upsert_pages'; -export * from './get_pages_by_website'; -export * from './get_page_by_url'; -export * from './register_urls'; -export * from './cleanup_website'; diff --git a/services/platform/convex/websites/internal_actions.ts b/services/platform/convex/websites/internal_actions.ts index 3fe2d51baf..022a18fac9 100644 --- a/services/platform/convex/websites/internal_actions.ts +++ b/services/platform/convex/websites/internal_actions.ts @@ -1,33 +1,181 @@ import { v } from 'convex/values'; +import type { Id } from '../_generated/dataModel'; +import type { CrawlerPagesResponse, CrawlerWebsiteInfo } from './types'; + +import { internal } from '../_generated/api'; import { internalAction } from '../_generated/server'; -import * as WebsitesHelpers from './helpers'; -export const provisionWebsiteScanWorkflow = internalAction({ +const CRAWLER_TIMEOUT_MS = 15_000; +const SYNC_INTERVAL_MS = 60 * 60 * 1000; // 1 hour + +function getCrawlerUrl() { + return process.env.CRAWLER_URL || 'http://localhost:8002'; +} + +function fetchWithTimeout( + url: string, + init?: RequestInit, + timeoutMs = CRAWLER_TIMEOUT_MS, +): Promise { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeoutMs); + return fetch(url, { ...init, signal: controller.signal }).finally(() => + clearTimeout(timer), + ); +} + +export function scanIntervalToSeconds(interval: string): number { 
+ switch (interval) { + case '60m': + return 3600; + case '6h': + return 21600; + case '12h': + return 43200; + case '1d': + return 86400; + case '5d': + return 432000; + case '7d': + return 604800; + case '30d': + return 2592000; + default: + return 21600; + } +} + +export async function registerDomainWithCrawler( + domain: string, + scanInterval: string, +): Promise { + const crawlerUrl = getCrawlerUrl(); + const res = await fetchWithTimeout(`${crawlerUrl}/api/v1/websites`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + domain, + scan_interval: scanIntervalToSeconds(scanInterval), + }), + }); + if (!res.ok) { + throw new Error( + `Failed to register website with crawler: ${res.status} ${res.statusText}`, + ); + } +} + +export async function deregisterDomainFromCrawler( + domain: string, +): Promise { + const crawlerUrl = getCrawlerUrl(); + const res = await fetchWithTimeout( + `${crawlerUrl}/api/v1/websites/${domain}`, + { method: 'DELETE' }, + ); + if (!res.ok && res.status !== 404) { + throw new Error( + `Failed to deregister website from crawler: ${res.status} ${res.statusText}`, + ); + } +} + +export async function fetchWebsiteInfo( + domain: string, +): Promise { + const crawlerUrl = getCrawlerUrl(); + try { + const res = await fetchWithTimeout( + `${crawlerUrl}/api/v1/websites/${domain}`, + ); + if (res.ok) { + return await res.json(); + } + } catch { + // Non-fatal: website info will be synced on next operation + } + return null; +} + +interface WebsiteForSync { + _id: Id<'websites'>; + domain: string; + pageCount?: number; + metadata?: Record; +} + +export const syncWebsiteStatuses = internalAction({ args: { organizationId: v.string(), - websiteId: v.id('websites'), - domain: v.string(), - scanInterval: v.string(), - autoTriggerInitialScan: v.optional(v.boolean()), }, - handler: async (ctx, args) => { - return await WebsitesHelpers.provisionWebsiteScanWorkflow(ctx, args); + handler: async (ctx, args): 
Promise => { + const websites: WebsiteForSync[] = await ctx.runQuery( + internal.websites.internal_queries.listWebsitesForSync, + { organizationId: args.organizationId }, + ); + + const now = Date.now(); + + for (const website of websites) { + const lastSync = website.metadata?.lastStatusSyncAt; + if (typeof lastSync === 'number' && now - lastSync < SYNC_INTERVAL_MS) { + continue; + } + + try { + const websiteInfo = await fetchWebsiteInfo(website.domain); + + if (websiteInfo) { + await ctx.runMutation( + internal.websites.internal_mutations.patchWebsite, + { + websiteId: website._id, + metadata: { ...website.metadata, lastStatusSyncAt: now }, + status: websiteInfo.status, + pageCount: websiteInfo.page_count, + title: websiteInfo.title ?? undefined, + description: websiteInfo.description ?? undefined, + lastScannedAt: websiteInfo.last_scanned_at + ? new Date(websiteInfo.last_scanned_at).getTime() + : undefined, + }, + ); + } + // Only update lastStatusSyncAt on success — skip on failure so retry + // happens on the next sync cycle instead of waiting the full interval. + } catch { + console.warn(`Failed to sync status for ${website.domain}, will retry`); + } + } }, }); -export const deregisterWebsiteFromCrawler = internalAction({ +export const fetchWebsitePages = internalAction({ args: { domain: v.string(), + offset: v.optional(v.number()), + limit: v.optional(v.number()), }, handler: async (_ctx, args) => { - const crawlerUrl = process.env.CRAWLER_URL || 'http://localhost:8002'; - try { - await fetch(`${crawlerUrl}/api/v1/websites/${args.domain}`, { - method: 'DELETE', - }); - } catch (e) { - console.warn('Failed to deregister website from crawler:', e); + const crawlerUrl = getCrawlerUrl(); + const offset = args.offset ?? 0; + const limit = args.limit ?? 
100; + + const res = await fetchWithTimeout( + `${crawlerUrl}/api/v1/pages/${args.domain}?offset=${offset}&limit=${limit}`, + ); + + if (!res.ok) { + throw new Error(`Crawler pages API returned ${res.status}`); } + + const data: CrawlerPagesResponse = await res.json(); + return { + pages: data.pages, + total: data.total, + offset: data.offset, + hasMore: data.has_more, + }; }, }); diff --git a/services/platform/convex/websites/internal_mutations.ts b/services/platform/convex/websites/internal_mutations.ts index 29d9109a71..efcc49010d 100644 --- a/services/platform/convex/websites/internal_mutations.ts +++ b/services/platform/convex/websites/internal_mutations.ts @@ -1,13 +1,13 @@ import { v } from 'convex/values'; import { jsonRecordValidator } from '../../lib/shared/schemas/utils/json-value'; -import { internal } from '../_generated/api'; import { internalMutation } from '../_generated/server'; import * as WebsitesHelpers from './helpers'; const websiteStatusValidator = v.union( + v.literal('idle'), + v.literal('scanning'), v.literal('active'), - v.literal('inactive'), v.literal('error'), ); @@ -26,76 +26,42 @@ export const provisionWebsite = internalMutation({ }, }); -export const patchWebsite = internalMutation({ +export const deleteWebsite = internalMutation({ args: { websiteId: v.id('websites'), - domain: v.optional(v.string()), - title: v.optional(v.string()), - description: v.optional(v.string()), - scanInterval: v.optional(v.string()), - lastScannedAt: v.optional(v.number()), - status: v.optional(websiteStatusValidator), - metadata: v.optional(jsonRecordValidator), - }, - handler: async (ctx, args) => { - return await WebsitesHelpers.updateWebsite(ctx, args); - }, -}); - -export const bulkUpsertPages = internalMutation({ - args: { - organizationId: v.string(), - websiteId: v.string(), - pages: v.array( - v.object({ - url: v.string(), - title: v.optional(v.string()), - content: v.optional(v.string()), - wordCount: v.optional(v.number()), - contentHash: 
v.optional(v.string()), - metadata: v.optional(jsonRecordValidator), - structuredData: v.optional(jsonRecordValidator), - }), - ), }, + returns: v.string(), handler: async (ctx, args) => { - return await WebsitesHelpers.bulkUpsertPages(ctx, args); + return await WebsitesHelpers.deleteWebsite(ctx, args.websiteId); }, }); -export const registerUrls = internalMutation({ +export const rescanWebsite = internalMutation({ args: { - organizationId: v.string(), - websiteId: v.string(), - urls: v.array( - v.object({ - url: v.string(), - contentHash: v.optional(v.string()), - status: v.optional(v.string()), - }), - ), + websiteId: v.id('websites'), }, + returns: v.object({ + domain: v.string(), + scanInterval: v.string(), + }), handler: async (ctx, args) => { - return await WebsitesHelpers.registerUrls(ctx, args); + return await WebsitesHelpers.rescanWebsite(ctx, args.websiteId); }, }); -export const batchCleanupWebsitePages = internalMutation({ +export const patchWebsite = internalMutation({ args: { websiteId: v.id('websites'), + domain: v.optional(v.string()), + title: v.optional(v.string()), + description: v.optional(v.string()), + scanInterval: v.optional(v.string()), + lastScannedAt: v.optional(v.number()), + status: v.optional(websiteStatusValidator), + pageCount: v.optional(v.number()), + metadata: v.optional(jsonRecordValidator), }, handler: async (ctx, args) => { - const result = await WebsitesHelpers.cleanupWebsitePagesBatch( - ctx, - args.websiteId, - ); - - if (result.hasMore) { - await ctx.scheduler.runAfter( - 0, - internal.websites.internal_mutations.batchCleanupWebsitePages, - { websiteId: args.websiteId }, - ); - } + return await WebsitesHelpers.updateWebsite(ctx, args); }, }); diff --git a/services/platform/convex/websites/internal_queries.ts b/services/platform/convex/websites/internal_queries.ts index 7ce2ddb2ec..1a78ef035d 100644 --- a/services/platform/convex/websites/internal_queries.ts +++ b/services/platform/convex/websites/internal_queries.ts @@ -1,6 
+1,7 @@ import { v } from 'convex/values'; import { internalQuery } from '../_generated/server'; +import { getOrganizationMember } from '../lib/rls'; import * as WebsitesHelpers from './helpers'; export const getWebsite = internalQuery({ @@ -12,22 +13,50 @@ export const getWebsite = internalQuery({ }, }); -export const getWebsiteByDomain = internalQuery({ +export const verifyOrganizationMembership = internalQuery({ args: { organizationId: v.string(), - domain: v.string(), + userId: v.string(), + email: v.string(), + name: v.string(), }, handler: async (ctx, args) => { - return await WebsitesHelpers.getWebsiteByDomain(ctx, args); + await getOrganizationMember(ctx, args.organizationId, { + userId: args.userId, + email: args.email, + name: args.name, + }); }, }); -export const getWebsitePageByUrl = internalQuery({ +export const listWebsitesForSync = internalQuery({ args: { organizationId: v.string(), - url: v.string(), }, handler: async (ctx, args) => { - return await WebsitesHelpers.getPageByUrl(ctx, args); + const results = []; + for await (const website of ctx.db + .query('websites') + .withIndex('by_organizationId', (q) => + q.eq('organizationId', args.organizationId), + )) { + results.push({ + _id: website._id, + domain: website.domain, + pageCount: website.pageCount, + metadata: website.metadata, + }); + } + return results; + }, +}); + +export const getWebsiteByDomain = internalQuery({ + args: { + organizationId: v.string(), + domain: v.string(), + }, + handler: async (ctx, args) => { + return await WebsitesHelpers.getWebsiteByDomain(ctx, args); }, }); diff --git a/services/platform/convex/websites/list_website_pages_paginated.test.ts b/services/platform/convex/websites/list_website_pages_paginated.test.ts deleted file mode 100644 index 8dd0ad4c72..0000000000 --- a/services/platform/convex/websites/list_website_pages_paginated.test.ts +++ /dev/null @@ -1,91 +0,0 @@ -import { describe, expect, it, vi } from 'vitest'; - -import type { QueryCtx } from 
'../_generated/server'; - -import { listWebsitePagesPaginated } from './list_website_pages_paginated'; - -function createMockQueryBuilder( - documents: Array> = [], -) { - const paginateResult = { - page: documents, - isDone: true, - continueCursor: documents.length > 0 ? 'cursor_1' : '', - }; - - const builder = { - withIndex: vi.fn().mockReturnThis(), - order: vi.fn().mockReturnThis(), - paginate: vi.fn().mockResolvedValue(paginateResult), - }; - - const ctx = { - db: { - query: vi.fn().mockReturnValue(builder), - }, - }; - - return { ctx, builder, paginateResult }; -} - -const DEFAULT_PAGINATION_OPTS = { numItems: 10, cursor: null, id: 0 }; - -describe('listWebsitePagesPaginated', () => { - it('queries websitePages with by_websiteId_and_lastCrawledAt index', async () => { - const { ctx, builder } = createMockQueryBuilder(); - - await listWebsitePagesPaginated(ctx as unknown as QueryCtx, { - paginationOpts: DEFAULT_PAGINATION_OPTS, - websiteId: 'website_1' as never, - }); - - expect(ctx.db.query).toHaveBeenCalledWith('websitePages'); - expect(builder.withIndex).toHaveBeenCalledWith( - 'by_websiteId_and_lastCrawledAt', - expect.any(Function), - ); - expect(builder.order).toHaveBeenCalledWith('desc'); - expect(builder.paginate).toHaveBeenCalledWith(DEFAULT_PAGINATION_OPTS); - }); - - it('returns pagination result with pages', async () => { - const docs = [ - { _id: 'p_1', url: 'https://example.com/page1', content: '# Hello' }, - { _id: 'p_2', url: 'https://example.com/page2', content: '# World' }, - ]; - const { ctx, paginateResult } = createMockQueryBuilder(docs); - - const result = await listWebsitePagesPaginated(ctx as unknown as QueryCtx, { - paginationOpts: DEFAULT_PAGINATION_OPTS, - websiteId: 'website_1' as never, - }); - - expect(result).toBe(paginateResult); - expect(result.page).toHaveLength(2); - }); - - it('returns empty result when no pages exist', async () => { - const { ctx, paginateResult } = createMockQueryBuilder([]); - - const result = await 
listWebsitePagesPaginated(ctx as unknown as QueryCtx, { - paginationOpts: DEFAULT_PAGINATION_OPTS, - websiteId: 'website_1' as never, - }); - - expect(result).toBe(paginateResult); - expect(result.page).toHaveLength(0); - expect(result.isDone).toBe(true); - }); - - it('passes paginationOpts through to paginate', async () => { - const { ctx, builder } = createMockQueryBuilder(); - const opts = { numItems: 50, cursor: 'abc123', id: 3 }; - - await listWebsitePagesPaginated(ctx as unknown as QueryCtx, { - paginationOpts: opts, - websiteId: 'website_1' as never, - }); - - expect(builder.paginate).toHaveBeenCalledWith(opts); - }); -}); diff --git a/services/platform/convex/websites/list_website_pages_paginated.ts b/services/platform/convex/websites/list_website_pages_paginated.ts deleted file mode 100644 index de2f618941..0000000000 --- a/services/platform/convex/websites/list_website_pages_paginated.ts +++ /dev/null @@ -1,28 +0,0 @@ -/** - * List website pages using Convex native .paginate() for use with usePaginatedQuery. - * - * Returns pages for a given website, ordered by lastCrawledAt descending. 
- */ - -import type { PaginationOptions, PaginationResult } from 'convex/server'; - -import type { Doc, Id } from '../_generated/dataModel'; -import type { QueryCtx } from '../_generated/server'; - -interface ListWebsitePagesPaginatedArgs { - paginationOpts: PaginationOptions; - websiteId: Id<'websites'>; -} - -export async function listWebsitePagesPaginated( - ctx: QueryCtx, - args: ListWebsitePagesPaginatedArgs, -): Promise>> { - return await ctx.db - .query('websitePages') - .withIndex('by_websiteId_and_lastCrawledAt', (q) => - q.eq('websiteId', args.websiteId), - ) - .order('desc') - .paginate(args.paginationOpts); -} diff --git a/services/platform/convex/websites/mutations.ts b/services/platform/convex/websites/mutations.ts index 00bc2cb8f8..4aca2c040b 100644 --- a/services/platform/convex/websites/mutations.ts +++ b/services/platform/convex/websites/mutations.ts @@ -6,36 +6,12 @@ import { getOrganizationMember } from '../lib/rls'; import * as WebsitesHelpers from './helpers'; const websiteStatusValidator = v.union( + v.literal('idle'), + v.literal('scanning'), v.literal('active'), - v.literal('inactive'), v.literal('error'), ); -export const createWebsite = mutation({ - args: { - organizationId: v.string(), - domain: v.string(), - title: v.optional(v.string()), - description: v.optional(v.string()), - scanInterval: v.string(), - }, - returns: v.id('websites'), - handler: async (ctx, args) => { - const authUser = await authComponent.getAuthUser(ctx); - if (!authUser) { - throw new Error('Unauthenticated'); - } - - await getOrganizationMember(ctx, args.organizationId, { - userId: authUser._id, - email: authUser.email, - name: authUser.name, - }); - - return await WebsitesHelpers.createWebsite(ctx, args); - }, -}); - export const updateWebsite = mutation({ args: { websiteId: v.id('websites'), @@ -67,57 +43,3 @@ export const updateWebsite = mutation({ return null; }, }); - -export const deleteWebsite = mutation({ - args: { - websiteId: v.id('websites'), - }, - 
returns: v.null(), - handler: async (ctx, args) => { - const authUser = await authComponent.getAuthUser(ctx); - if (!authUser) { - throw new Error('Unauthenticated'); - } - - const website = await ctx.db.get(args.websiteId); - if (!website) { - throw new Error('Website not found'); - } - - await getOrganizationMember(ctx, website.organizationId, { - userId: authUser._id, - email: authUser.email, - name: authUser.name, - }); - - await WebsitesHelpers.deleteWebsite(ctx, args.websiteId); - return null; - }, -}); - -export const rescanWebsite = mutation({ - args: { - websiteId: v.id('websites'), - }, - returns: v.null(), - handler: async (ctx, args) => { - const authUser = await authComponent.getAuthUser(ctx); - if (!authUser) { - throw new Error('Unauthenticated'); - } - - const website = await ctx.db.get(args.websiteId); - if (!website) { - throw new Error('Website not found'); - } - - await getOrganizationMember(ctx, website.organizationId, { - userId: authUser._id, - email: authUser.email, - name: authUser.name, - }); - - await WebsitesHelpers.rescanWebsite(ctx, args.websiteId); - return null; - }, -}); diff --git a/services/platform/convex/websites/provision_website_scan_workflow.ts b/services/platform/convex/websites/provision_website_scan_workflow.ts deleted file mode 100644 index a5fc9032a1..0000000000 --- a/services/platform/convex/websites/provision_website_scan_workflow.ts +++ /dev/null @@ -1,205 +0,0 @@ -/** - * Provision and publish a Website Scan workflow for a website. - * - * Model-layer helper invoked by internal.websites.internal_mutations.provisionWebsiteScanWorkflow. 
- */ - -import type { Id } from '../_generated/dataModel'; -import type { ActionCtx } from '../_generated/server'; - -import { isRecord } from '../../lib/utils/type-guards'; -import { internal } from '../_generated/api'; -import { toConvexJsonRecord } from '../lib/type_cast_helpers'; -import websiteScanWorkflow from '../predefined_workflows/website_scan'; -import { toPredefinedWorkflowPayload } from '../workflows/definitions/types'; - -export interface ProvisionWebsiteScanWorkflowArgs { - organizationId: string; - websiteId: Id<'websites'>; - domain: string; - scanInterval: string; - autoTriggerInitialScan?: boolean; -} - -function scanIntervalToCron(interval: string): { - schedule: string; - timezone: string; -} { - const timezone = 'UTC'; - switch (interval) { - case '60m': - return { schedule: '0 * * * *', timezone }; - case '6h': - return { schedule: '0 */6 * * *', timezone }; - case '12h': - return { schedule: '0 */12 * * *', timezone }; - case '1d': - return { schedule: '0 2 * * *', timezone }; - case '5d': - return { schedule: '0 2 */5 * *', timezone }; - case '7d': - return { schedule: '0 2 */7 * *', timezone }; - case '30d': - return { schedule: '0 2 1 * *', timezone }; - default: - return { schedule: '0 2 * * *', timezone }; - } -} - -function scanIntervalToSeconds(interval: string): number { - switch (interval) { - case '60m': - return 3600; - case '6h': - return 21600; - case '12h': - return 43200; - case '1d': - return 86400; - case '5d': - return 432000; - case '7d': - return 604800; - case '30d': - return 2592000; - default: - return 21600; - } -} - -export async function provisionWebsiteScanWorkflow( - ctx: ActionCtx, - args: ProvisionWebsiteScanWorkflowArgs, -): Promise { - const ensureUrl = (s: string) => - s.startsWith('http://') || s.startsWith('https://') ? 
s : `https://${s}`; - - const u = new URL(ensureUrl(args.domain)); - const websiteUrl = `${u.protocol}//${u.host}${u.pathname || ''}`; - const websiteDomain = u.hostname; - - // Register website with crawler service for autonomous background scanning - const crawlerUrl = process.env.CRAWLER_URL || 'http://localhost:8002'; - try { - await fetch(`${crawlerUrl}/api/v1/websites`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - domain: websiteDomain, - scan_interval: scanIntervalToSeconds(args.scanInterval), - }), - }); - } catch (e) { - // Non-fatal: crawler registration can be retried on next scan - console.warn('Failed to register website with crawler:', e); - } - - const { schedule, timezone } = scanIntervalToCron(args.scanInterval); - - const rawVars = websiteScanWorkflow.workflowConfig.config?.variables; - const templateVars = isRecord(rawVars) ? rawVars : {}; - - const variables = toConvexJsonRecord({ - ...templateVars, - organizationId: args.organizationId, - websiteId: args.websiteId, - websiteUrl, - websiteDomain, - scanInterval: args.scanInterval, - }); - - const workflowName = `Website Scan - ${websiteDomain}`; - - // Use toPredefinedWorkflowPayload to handle type bridging from loose predefined workflow types - const payload = toPredefinedWorkflowPayload( - websiteScanWorkflow, - { - name: workflowName, - config: { - ...websiteScanWorkflow.workflowConfig.config, - variables, - }, - }, - (step) => - step.stepType === 'start' || step.stepType === 'trigger' - ? { - ...step, - config: { - ...(isRecord(step.config) ? step.config : {}), - type: 'scheduled', - schedule, - timezone, - }, - } - : step, - ); - - const saved = await ctx.runMutation( - internal.wf_definitions.internal_mutations.provisionWorkflowWithSteps, - { - organizationId: args.organizationId, - ...payload, - }, - ); - - // Newly created workflows start as drafts; publish immediately. 
- await ctx.runMutation( - internal.wf_definitions.internal_mutations.provisionPublishDraft, - { - wfDefinitionId: saved.workflowId, - publishedBy: 'system', - changeLog: 'Auto-created and published from website creation', - }, - ); - - // Register the schedule so the cron scanner picks up this workflow - await ctx.runMutation( - internal.workflows.triggers.internal_mutations.provisionSchedule, - { - organizationId: args.organizationId, - workflowRootId: saved.workflowId, - cronExpression: schedule, - timezone, - createdBy: 'system', - }, - ); - - const current = await ctx.runQuery( - internal.websites.internal_queries.getWebsite, - { - websiteId: args.websiteId, - }, - ); - const existingMeta = isRecord(current?.metadata) ? current.metadata : {}; - - await ctx.runMutation(internal.websites.internal_mutations.patchWebsite, { - websiteId: args.websiteId, - metadata: { - ...existingMeta, - workflowId: saved.workflowId, - }, - }); - - if (args.autoTriggerInitialScan === true) { - await ctx.runMutation(internal.websites.internal_mutations.patchWebsite, { - websiteId: args.websiteId, - lastScannedAt: Date.now(), - }); - - await ctx.scheduler.runAfter( - 300000, - internal.workflow_engine.internal_mutations.startWorkflow, - { - organizationId: args.organizationId, - wfDefinitionId: saved.workflowId, - input: { websiteId: args.websiteId, domain: websiteDomain }, - triggeredBy: 'system', - triggerData: { - triggerType: 'system', - reason: 'initial_website_scan', - timestamp: Date.now(), - }, - }, - ); - } -} diff --git a/services/platform/convex/websites/queries.ts b/services/platform/convex/websites/queries.ts index 916050f7d4..fb217473dc 100644 --- a/services/platform/convex/websites/queries.ts +++ b/services/platform/convex/websites/queries.ts @@ -3,7 +3,6 @@ import { v } from 'convex/values'; import { countItemsInOrg } from '../lib/helpers/count_items_in_org'; import { queryWithRLS } from '../lib/rls'; -import { listWebsitePagesPaginated as 
listWebsitePagesPaginatedHelper } from './list_website_pages_paginated'; import { listWebsitesPaginated as listWebsitesPaginatedHelper } from './list_websites_paginated'; import { websiteValidator } from './validators'; @@ -45,13 +44,3 @@ export const listWebsitesPaginated = queryWithRLS({ return await listWebsitesPaginatedHelper(ctx, args); }, }); - -export const listWebsitePagesPaginated = queryWithRLS({ - args: { - paginationOpts: paginationOptsValidator, - websiteId: v.id('websites'), - }, - handler: async (ctx, args) => { - return await listWebsitePagesPaginatedHelper(ctx, args); - }, -}); diff --git a/services/platform/convex/websites/register_urls.ts b/services/platform/convex/websites/register_urls.ts deleted file mode 100644 index 3e46277a69..0000000000 --- a/services/platform/convex/websites/register_urls.ts +++ /dev/null @@ -1,119 +0,0 @@ -import type { MutationCtx } from '../_generated/server'; - -import { toId } from '../lib/type_cast_helpers'; - -const EMBEDDING_TABLES = [ - 'websitePageEmbeddings256', - 'websitePageEmbeddings512', - 'websitePageEmbeddings1024', - 'websitePageEmbeddings1536', - 'websitePageEmbeddings2048', - 'websitePageEmbeddings2560', - 'websitePageEmbeddings4096', -] as const; - -export interface UrlEntry { - url: string; - contentHash?: string; - status?: string; -} - -export interface RegisterUrlsArgs { - organizationId: string; - websiteId: string; - urls: UrlEntry[]; -} - -export interface RegisterUrlsResult { - registered: number; - updated: number; - deleted: number; - skipped: number; - total: number; - urlsToSync: string[]; -} - -export async function registerUrls( - ctx: MutationCtx, - args: RegisterUrlsArgs, -): Promise { - const now = Date.now(); - const websiteId = toId<'websites'>(args.websiteId); - let registered = 0; - let updated = 0; - let deleted = 0; - let skipped = 0; - const urlsToSync: string[] = []; - - for (const entry of args.urls) { - const existing = await ctx.db - .query('websitePages') - 
.withIndex('by_organizationId_and_url', (q) => - q.eq('organizationId', args.organizationId).eq('url', entry.url), - ) - .first(); - - // Handle deleted URLs — remove page + embeddings - if (entry.status === 'deleted') { - if (existing) { - for (const table of EMBEDDING_TABLES) { - for await (const embedding of ctx.db - .query(table) - .withIndex('by_pageId', (q) => q.eq('pageId', existing._id))) { - await ctx.db.delete(embedding._id); - } - } - await ctx.db.delete(existing._id); - deleted++; - } - continue; - } - - if (existing) { - // Existing URL — check if hash changed - if (entry.contentHash && entry.contentHash !== existing.contentHash) { - await ctx.db.patch(existing._id, { - contentHash: entry.contentHash, - syncStatus: 'pending', - }); - updated++; - urlsToSync.push(entry.url); - } else { - skipped++; - } - continue; - } - - // New URL — insert as pending - await ctx.db.insert('websitePages', { - organizationId: args.organizationId, - websiteId, - url: entry.url, - contentHash: entry.contentHash, - lastCrawledAt: now, - syncStatus: 'pending', - }); - registered++; - urlsToSync.push(entry.url); - } - - // Update page count: new pages added minus pages deleted - const netChange = registered - deleted; - if (netChange !== 0) { - const website = await ctx.db.get(websiteId); - if (website) { - await ctx.db.patch(websiteId, { - pageCount: Math.max(0, (website.pageCount ?? 0) + netChange), - }); - } - } - - return { - registered, - updated, - deleted, - skipped, - total: args.urls.length, - urlsToSync, - }; -} diff --git a/services/platform/convex/websites/rescan_website.ts b/services/platform/convex/websites/rescan_website.ts index 55f917861d..e8cdc8e619 100644 --- a/services/platform/convex/websites/rescan_website.ts +++ b/services/platform/convex/websites/rescan_website.ts @@ -1,32 +1,28 @@ /** - * Trigger a manual rescan of a website + * Mark a website as rescanning and normalize its domain. + * Does NOT call crawler — that's handled by the calling action. 
*/ -import type { Id, Doc } from '../_generated/dataModel'; +import type { Id } from '../_generated/dataModel'; import type { MutationCtx } from '../_generated/server'; -import { internal } from '../_generated/api'; +import { toWebsiteDomain } from './create_website'; + +export interface RescanWebsiteResult { + domain: string; + scanInterval: string; +} -/** - * Trigger a manual rescan of a website - * - * - Finds the attached workflow (by metadata.workflowId, else by naming convention) - * - Starts the workflow immediately as a manual run - * - Updates website status and lastScannedAt optimistically - */ export async function rescanWebsite( ctx: MutationCtx, websiteId: Id<'websites'>, -): Promise | null> { +): Promise { const website = await ctx.db.get(websiteId); if (!website) { throw new Error('Website not found'); } - // Normalize domain for consistency - const ensureUrl = (s: string) => - s.startsWith('http://') || s.startsWith('https://') ? s : `https://${s}`; - const normalizedDomain = new URL(ensureUrl(website.domain)).hostname; + const normalizedDomain = toWebsiteDomain(website.domain); // If stored domain includes protocol/path, normalize it when safe (no conflict) if (normalizedDomain !== website.domain) { @@ -43,35 +39,14 @@ export async function rescanWebsite( } } - // Resolve the attached workflow id from metadata only. - const metadata = website.metadata ?? 
{}; - const workflowId: Id<'wfDefinitions'> | undefined = metadata['workflowId']; - - if (!workflowId) { - throw new Error('Attached workflowId missing from website metadata'); - } - - // Start the workflow immediately using the engine executor directly - await ctx.runMutation( - internal.workflow_engine.internal_mutations.startWorkflow, - { - organizationId: website.organizationId, - wfDefinitionId: workflowId, - input: { websiteId: website._id, domain: normalizedDomain }, - triggeredBy: 'manual', - triggerData: { - triggerType: 'manual', - reason: 'rescan', - timestamp: Date.now(), - }, - }, - ); - // Optimistically update the last scanned timestamp/status await ctx.db.patch(websiteId, { lastScannedAt: Date.now(), status: 'active', }); - return await ctx.db.get(websiteId); + return { + domain: normalizedDomain, + scanInterval: website.scanInterval, + }; } diff --git a/services/platform/convex/websites/schema.ts b/services/platform/convex/websites/schema.ts index 81941fa2ea..e41260ff3f 100644 --- a/services/platform/convex/websites/schema.ts +++ b/services/platform/convex/websites/schema.ts @@ -11,7 +11,12 @@ export const websitesTable = defineTable({ scanInterval: v.string(), lastScannedAt: v.optional(v.number()), status: v.optional( - v.union(v.literal('active'), v.literal('inactive'), v.literal('error')), + v.union( + v.literal('idle'), + v.literal('scanning'), + v.literal('active'), + v.literal('error'), + ), ), pageCount: v.optional(v.number()), metadata: v.optional(jsonRecordValidator), @@ -19,22 +24,3 @@ export const websitesTable = defineTable({ .index('by_organizationId', ['organizationId']) .index('by_organizationId_and_status', ['organizationId', 'status']) .index('by_organizationId_and_domain', ['organizationId', 'domain']); - -export const websitePagesTable = defineTable({ - organizationId: v.string(), - websiteId: v.id('websites'), - url: v.string(), - title: v.optional(v.string()), - content: v.optional(v.string()), - wordCount: 
v.optional(v.number()), - lastCrawledAt: v.number(), - metadata: v.optional(jsonRecordValidator), - structuredData: v.optional(jsonRecordValidator), - contentHash: v.optional(v.string()), - syncStatus: v.optional(v.union(v.literal('pending'), v.literal('synced'))), -}) - .index('by_organizationId', ['organizationId']) - .index('by_websiteId', ['websiteId']) - .index('by_websiteId_and_lastCrawledAt', ['websiteId', 'lastCrawledAt']) - .index('by_organizationId_and_url', ['organizationId', 'url']) - .index('by_websiteId_and_syncStatus', ['websiteId', 'syncStatus']); diff --git a/services/platform/convex/websites/types.ts b/services/platform/convex/websites/types.ts index d23b9c588a..53d7025021 100644 --- a/services/platform/convex/websites/types.ts +++ b/services/platform/convex/websites/types.ts @@ -4,14 +4,9 @@ import type { Infer } from 'convex/values'; -import type { ConvexJsonRecord } from '../../lib/shared/schemas/utils/json-value'; import type { Id } from '../_generated/dataModel'; -import { - websitePageValidator, - websiteStatusValidator, - websiteValidator, -} from './validators'; +import { websiteStatusValidator, websiteValidator } from './validators'; // ============================================================================= // INFERRED TYPES (from validators) @@ -19,7 +14,6 @@ import { export type WebsiteStatus = Infer; export type Website = Infer; -export type WebsitePage = Infer; // ============================================================================= // MANUAL TYPES (no corresponding validator) @@ -70,28 +64,42 @@ export interface BulkWebsiteData { metadata?: Record; } -/** - * Args for bulk upserting website pages - */ -export interface BulkUpsertPagesArgs { - organizationId: string; - websiteId: string; - pages: Array<{ - url: string; - title?: string; - content?: string; - wordCount?: number; - contentHash?: string; - metadata?: ConvexJsonRecord; - structuredData?: ConvexJsonRecord; - }>; +// 
============================================================================= +// CRAWLER SERVICE TYPES +// ============================================================================= + +export interface CrawlerPage { + url: string; + title: string | null; + word_count: number; + status: string; + content_hash: string | null; + last_crawled_at: string | null; + discovered_at: string | null; + chunks_count: number; + indexed: boolean; } -/** - * Result from bulk upserting website pages - */ -export interface BulkUpsertPagesResult { - created: number; - updated: number; +export interface CrawlerWebsiteInfo { + domain: string; + title: string | null; + description: string | null; + page_count: number; + status: WebsiteStatus; + last_scanned_at: string | null; +} + +export interface CrawlerPagesResponse { + domain: string; + pages: CrawlerPage[]; + total: number; + offset: number; + has_more: boolean; +} + +export interface FetchPagesResult { + pages: CrawlerPage[]; total: number; + offset: number; + hasMore: boolean; } diff --git a/services/platform/convex/websites/update_website.ts b/services/platform/convex/websites/update_website.ts index 89399ac23d..140579ab9b 100644 --- a/services/platform/convex/websites/update_website.ts +++ b/services/platform/convex/websites/update_website.ts @@ -5,6 +5,8 @@ import type { Id, Doc } from '../_generated/dataModel'; import type { MutationCtx } from '../_generated/server'; +import { ensureUrl } from './create_website'; + export interface UpdateWebsiteArgs { websiteId: Id<'websites'>; domain?: string; @@ -12,7 +14,7 @@ export interface UpdateWebsiteArgs { description?: string; scanInterval?: string; lastScannedAt?: number; - status?: 'active' | 'inactive' | 'error'; + status?: 'idle' | 'scanning' | 'active' | 'error'; metadata?: unknown; } @@ -33,8 +35,6 @@ export async function updateWebsite( // If domain provided, normalize to bare hostname and check for conflicts if (updateData.domain) { - const ensureUrl = (s: string) => - 
s.startsWith('http://') || s.startsWith('https://') ? s : `https://${s}`; const normalized = new URL(ensureUrl(updateData.domain)).hostname; updateData.domain = normalized; diff --git a/services/platform/convex/websites/validators.ts b/services/platform/convex/websites/validators.ts index 1c0e9b297d..c06d3e3f32 100644 --- a/services/platform/convex/websites/validators.ts +++ b/services/platform/convex/websites/validators.ts @@ -14,7 +14,6 @@ import { websiteStatusSchema } from '../../lib/shared/schemas/websites'; export { websiteStatusSchema, websiteSchema, - websitePageSchema, } from '../../lib/shared/schemas/websites'; // Simple schemas without z.lazy() @@ -34,18 +33,3 @@ export const websiteValidator = v.object({ pageCount: v.optional(v.number()), metadata: v.optional(jsonRecordValidator), }); - -export const websitePageValidator = v.object({ - _id: v.string(), - _creationTime: v.number(), - organizationId: v.string(), - websiteId: v.string(), - url: v.string(), - title: v.optional(v.string()), - content: v.optional(v.string()), - wordCount: v.optional(v.number()), - lastCrawledAt: v.number(), - metadata: v.optional(jsonRecordValidator), - structuredData: v.optional(jsonRecordValidator), - syncStatus: v.optional(v.union(v.literal('pending'), v.literal('synced'))), -}); diff --git a/services/platform/convex/workflow_engine/action_defs/action_registry.ts b/services/platform/convex/workflow_engine/action_defs/action_registry.ts index 4bb4ed610c..f376bd3646 100644 --- a/services/platform/convex/workflow_engine/action_defs/action_registry.ts +++ b/services/platform/convex/workflow_engine/action_defs/action_registry.ts @@ -11,7 +11,6 @@ import { productAction } from './product/product_action'; import { ragAction } from './rag/rag_action'; import { setVariablesAction } from './set_variables_action'; import { websiteAction } from './website/website_action'; -import { websitePagesAction } from './website_pages/website_pages_action'; import { workflowAction } from 
'./workflow/workflow_action'; import { workflowProcessingRecordsAction } from './workflow_processing_records/workflow_processing_records_action'; @@ -48,7 +47,6 @@ export const ACTIONS: AnyActionDefinition[] = [ onedriveAction, crawlerAction, websiteAction, - websitePagesAction, workflowAction, ]; diff --git a/services/platform/convex/workflow_engine/action_defs/website/helpers/types.ts b/services/platform/convex/workflow_engine/action_defs/website/helpers/types.ts index 651492890d..2ad1c9033b 100644 --- a/services/platform/convex/workflow_engine/action_defs/website/helpers/types.ts +++ b/services/platform/convex/workflow_engine/action_defs/website/helpers/types.ts @@ -10,7 +10,7 @@ export type WebsiteActionParams = description?: string; scanInterval?: string; lastScannedAt?: number; - status?: 'active' | 'inactive' | 'error'; + status?: 'idle' | 'scanning' | 'active' | 'error'; metadata?: ConvexJsonRecord; } | { @@ -21,7 +21,7 @@ export type WebsiteActionParams = description?: string; scanInterval?: string; lastScannedAt?: number; - status?: 'active' | 'inactive' | 'error'; + status?: 'idle' | 'scanning' | 'active' | 'error'; metadata?: ConvexJsonRecord; } | { diff --git a/services/platform/convex/workflow_engine/action_defs/website/website_action.ts b/services/platform/convex/workflow_engine/action_defs/website/website_action.ts index e448730b86..92ed808e3a 100644 --- a/services/platform/convex/workflow_engine/action_defs/website/website_action.ts +++ b/services/platform/convex/workflow_engine/action_defs/website/website_action.ts @@ -8,7 +8,12 @@ import { internal } from '../../../_generated/api'; // Common field validators const statusValidator = v.optional( - v.union(v.literal('active'), v.literal('inactive'), v.literal('error')), + v.union( + v.literal('idle'), + v.literal('scanning'), + v.literal('active'), + v.literal('error'), + ), ); export const websiteAction: ActionDefinition = { diff --git 
a/services/platform/convex/workflow_engine/action_defs/website_pages/helpers/types.ts b/services/platform/convex/workflow_engine/action_defs/website_pages/helpers/types.ts deleted file mode 100644 index 39d5abd987..0000000000 --- a/services/platform/convex/workflow_engine/action_defs/website_pages/helpers/types.ts +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Website Pages action types - */ - -import type { Id } from '../../../../_generated/dataModel'; - -// Page data structure -// Note: We use camelCase consistently as per Convex conventions. -// The snake_case aliases (word_count, structured_data) are kept for -// backwards compatibility with existing workflows that may use them. -export interface PageData { - url: string; - title?: string; - description?: string; - content?: string; - wordCount?: number; - /** @deprecated Use wordCount instead */ - word_count?: number; - metadata?: unknown; - structuredData?: unknown; - /** @deprecated Use structuredData instead */ - structured_data?: unknown; -} - -// Discriminated union for website pages operations -export type WebsitePagesActionParams = - | { - operation: 'bulk_upsert'; - websiteId: Id<'websites'>; - pages: PageData[]; - } - | { - operation: 'register_urls'; - websiteId: Id<'websites'>; - urls: Array<{ - url: string; - contentHash?: string | null; - status?: string; - }>; - } - | { - operation: 'crawl_and_upsert'; - websiteId: Id<'websites'>; - urls: string[]; - wordCountThreshold?: number; - crawlerTimeoutMs?: number; - }; diff --git a/services/platform/convex/workflow_engine/action_defs/website_pages/website_pages_action.ts b/services/platform/convex/workflow_engine/action_defs/website_pages/website_pages_action.ts deleted file mode 100644 index 78b33c7708..0000000000 --- a/services/platform/convex/workflow_engine/action_defs/website_pages/website_pages_action.ts +++ /dev/null @@ -1,339 +0,0 @@ -import { v } from 'convex/values'; - -import type { ActionCtx } from '../../../_generated/server'; -import type { 
ActionDefinition } from '../../helpers/nodes/action/types'; -import type { WebsitePagesActionParams } from './helpers/types'; - -import { jsonRecordValidator } from '../../../../lib/shared/schemas/utils/json-value'; -import { isRecord, getRecord } from '../../../../lib/utils/type-guards'; -import { internal } from '../../../_generated/api'; -import { createDebugLog } from '../../../lib/debug_log'; -import { toConvexJsonValue } from '../../../lib/type_cast_helpers'; - -const debugLog = createDebugLog('DEBUG_WEBSITE_PAGES', '[WebsitePages]'); - -const BATCH_SIZE = 100; - -const pageValidator = v.object({ - url: v.string(), - title: v.optional(v.string()), - description: v.optional(v.string()), - content: v.optional(v.string()), - wordCount: v.optional(v.number()), - word_count: v.optional(v.number()), - metadata: v.optional(jsonRecordValidator), - structuredData: v.optional(jsonRecordValidator), - structured_data: v.optional(jsonRecordValidator), -}); - -export const websitePagesAction: ActionDefinition = { - type: 'websitePages', - title: 'Website Pages', - description: - 'Manage website pages (bulk upsert, register URLs, crawl and upsert). 
organizationId is automatically read from workflow context variables.', - - parametersValidator: v.union( - v.object({ - operation: v.literal('bulk_upsert'), - websiteId: v.id('websites'), - pages: v.array(pageValidator), - }), - v.object({ - operation: v.literal('register_urls'), - websiteId: v.id('websites'), - urls: v.array( - v.object({ - url: v.string(), - contentHash: v.optional(v.union(v.string(), v.null())), - status: v.optional(v.string()), - }), - ), - }), - v.object({ - operation: v.literal('crawl_and_upsert'), - websiteId: v.id('websites'), - urls: v.array(v.string()), - wordCountThreshold: v.optional(v.number()), - crawlerTimeoutMs: v.optional(v.number()), - }), - ), - - async execute(ctx, params, variables) { - switch (params.operation) { - case 'bulk_upsert': - return await executeBulkUpsert(ctx, params, variables); - case 'register_urls': - return await executeRegisterUrls(ctx, params, variables); - case 'crawl_and_upsert': - return await executeCrawlAndUpsert(ctx, params, variables); - default: - throw new Error( - `Unknown websitePages operation: ${(params as { operation: string }).operation}`, - ); - } - }, -}; - -type BulkUpsertParams = Extract< - WebsitePagesActionParams, - { operation: 'bulk_upsert' } ->; - -type RegisterUrlsParams = Extract< - WebsitePagesActionParams, - { operation: 'register_urls' } ->; - -type CrawlAndUpsertParams = Extract< - WebsitePagesActionParams, - { operation: 'crawl_and_upsert' } ->; - -function getOrganizationId( - variables: Record, - operation: string, -) { - const organizationId = - typeof variables.organizationId === 'string' - ? 
variables.organizationId - : undefined; - - if (!organizationId) { - throw new Error( - `${operation} operation requires organizationId in context`, - ); - } - - return organizationId; -} - -async function executeBulkUpsert( - ctx: ActionCtx, - params: BulkUpsertParams, - variables: Record, -) { - const organizationId = getOrganizationId(variables, 'bulk_upsert'); - - const normalizedPages = params.pages.map((p) => { - const sd = p.structuredData ?? p.structured_data; - return { - url: p.url, - title: p.title ?? undefined, - content: p.content ?? undefined, - wordCount: p.wordCount ?? p.word_count ?? undefined, - metadata: p.metadata ? toConvexJsonValue(p.metadata) : undefined, - structuredData: sd ? toConvexJsonValue(sd) : undefined, - }; - }); - - const result = await ctx.runMutation( - internal.websites.internal_mutations.bulkUpsertPages, - { - organizationId, - websiteId: params.websiteId, - pages: normalizedPages, - }, - ); - - return { - operation: 'bulk_upsert' as const, - created: result.created, - updated: result.updated, - total: result.total, - success: true, - timestamp: Date.now(), - }; -} - -async function executeRegisterUrls( - ctx: ActionCtx, - params: RegisterUrlsParams, - variables: Record, -) { - const organizationId = getOrganizationId(variables, 'register_urls'); - - let totalRegistered = 0; - let totalUpdated = 0; - let totalDeleted = 0; - let totalSkipped = 0; - const allUrlsToSync: string[] = []; - - // Normalize entries: map null contentHash to undefined for Convex - const entries = params.urls.map((entry) => ({ - url: entry.url, - contentHash: entry.contentHash ?? 
undefined, - status: entry.status, - })); - - for (let i = 0; i < entries.length; i += BATCH_SIZE) { - const batch = entries.slice(i, i + BATCH_SIZE); - - debugLog( - `Registering URL batch ${Math.floor(i / BATCH_SIZE) + 1} (${batch.length} URLs)`, - ); - - const result = await ctx.runMutation( - internal.websites.internal_mutations.registerUrls, - { - organizationId, - websiteId: params.websiteId, - urls: batch, - }, - ); - - totalRegistered += result.registered; - totalUpdated += result.updated; - totalDeleted += result.deleted; - totalSkipped += result.skipped; - allUrlsToSync.push(...result.urlsToSync); - } - - debugLog( - `Registered ${totalRegistered}, updated ${totalUpdated}, deleted ${totalDeleted}, skipped ${totalSkipped}`, - ); - - return { - operation: 'register_urls' as const, - registered: totalRegistered, - updated: totalUpdated, - deleted: totalDeleted, - skipped: totalSkipped, - total: params.urls.length, - urlsToSync: allUrlsToSync, - success: true, - timestamp: Date.now(), - }; -} - -interface CrawlerFetchResponse { - success: boolean; - urls_requested: number; - urls_fetched: number; - pages: Array<{ - url: string; - title?: string; - content: string; - word_count: number; - metadata?: Record; - structured_data?: Record; - }>; - failed?: Array<{ url: string; status_code: number | null; error: string }>; -} - -async function executeCrawlAndUpsert( - ctx: ActionCtx, - params: CrawlAndUpsertParams, - variables: Record, -) { - const organizationId = getOrganizationId(variables, 'crawl_and_upsert'); - - debugLog('crawl_and_upsert params.urls:', { - hasUrls: params.urls !== undefined, - urlsType: typeof params.urls, - isArray: Array.isArray(params.urls), - urlsLength: Array.isArray(params.urls) ? params.urls.length : 'N/A', - registerUrlsStep: (() => { - const steps = isRecord(variables.steps) ? variables.steps : undefined; - return steps?.register_urls ? 
'exists' : 'missing'; - })(), - registerUrlsOutput: (() => { - const steps = isRecord(variables.steps) ? variables.steps : undefined; - const step = steps ? getRecord(steps, 'register_urls') : undefined; - const output = step ? getRecord(step, 'output') : undefined; - const data = output ? getRecord(output, 'data') : undefined; - return { - hasOutput: !!output, - hasData: !!data, - hasUrlsToSync: data?.urlsToSync !== undefined, - urlsToSyncType: typeof data?.urlsToSync, - urlsToSyncIsArray: Array.isArray(data?.urlsToSync), - }; - })(), - }); - - const urls = params.urls ?? []; - - if (urls.length === 0) { - debugLog('No URLs to sync'); - return { - operation: 'crawl_and_upsert' as const, - processed: 0, - failed: 0, - total: 0, - success: true, - timestamp: Date.now(), - }; - } - - const wordCountThreshold = params.wordCountThreshold ?? 100; - const crawlerTimeout = params.crawlerTimeoutMs ?? 300000; - const serviceUrl = process.env.CRAWLER_URL || 'http://localhost:8002'; - - debugLog(`Fetching ${urls.length} pages via crawler`); - - const controller = new AbortController(); - const timeoutId = setTimeout(() => controller.abort(), crawlerTimeout); - - const response = await fetch(`${serviceUrl}/api/v1/urls/fetch`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - urls, - word_count_threshold: wordCountThreshold, - }), - signal: controller.signal, - }); - - clearTimeout(timeoutId); - - if (!response.ok) { - const errorText = await response.text(); - throw new Error(`Crawler service error (${response.status}): ${errorText}`); - } - - const fetchResult: CrawlerFetchResponse = await response.json(); - - if (!fetchResult.success) { - const errorMessage = - // oxlint-disable-next-line typescript/no-unsafe-type-assertion -- dynamic data - (fetchResult as unknown as { error?: string }).error || 'Unknown error'; - throw new Error(`URL fetch failed: ${errorMessage}`); - } - - const failedCount = fetchResult.failed?.length ?? 
0; - - debugLog( - `Crawler returned ${fetchResult.pages.length} pages, ${failedCount} failures`, - ); - - if (fetchResult.pages.length > 0) { - const normalizedPages = fetchResult.pages.map((p) => ({ - url: p.url, - title: p.title ?? undefined, - content: p.content, - wordCount: p.word_count, - metadata: p.metadata ? toConvexJsonValue(p.metadata) : undefined, - structuredData: p.structured_data - ? toConvexJsonValue(p.structured_data) - : undefined, - })); - - await ctx.runMutation( - internal.websites.internal_mutations.bulkUpsertPages, - { - organizationId, - websiteId: params.websiteId, - pages: normalizedPages, - }, - ); - } - - return { - operation: 'crawl_and_upsert' as const, - processed: fetchResult.pages.length, - failed: failedCount, - total: urls.length, - success: true, - timestamp: Date.now(), - }; -} diff --git a/services/platform/convex/workflow_engine/action_defs/workflow_processing_records/helpers/types.ts b/services/platform/convex/workflow_engine/action_defs/workflow_processing_records/helpers/types.ts index 86ff05cc4e..7aff6a5b07 100644 --- a/services/platform/convex/workflow_engine/action_defs/workflow_processing_records/helpers/types.ts +++ b/services/platform/convex/workflow_engine/action_defs/workflow_processing_records/helpers/types.ts @@ -6,8 +6,7 @@ export type TableName = | 'documents' | 'conversations' | 'approvals' - | 'onedriveSyncConfigs' - | 'websitePages'; + | 'onedriveSyncConfigs'; // The model layer returns { documents: T[], count } but actions extract just the first document // because execute_action_node wraps the result in output: { type: 'action', data: result } diff --git a/services/platform/convex/workflow_engine/action_defs/workflow_processing_records/workflow_processing_records_action.ts b/services/platform/convex/workflow_engine/action_defs/workflow_processing_records/workflow_processing_records_action.ts index 7ed440449b..1abf3cb431 100644 --- 
a/services/platform/convex/workflow_engine/action_defs/workflow_processing_records/workflow_processing_records_action.ts +++ b/services/platform/convex/workflow_engine/action_defs/workflow_processing_records/workflow_processing_records_action.ts @@ -32,7 +32,6 @@ const tableNameValidator = v.union( v.literal('conversationMessages'), v.literal('approvals'), v.literal('onedriveSyncConfigs'), - v.literal('websitePages'), ); // Type for workflow processing records operation params (discriminated union) diff --git a/services/platform/convex/workflow_engine/helpers/validation/variables/action_schemas.ts b/services/platform/convex/workflow_engine/helpers/validation/variables/action_schemas.ts index 018b9be163..5a0549f03b 100644 --- a/services/platform/convex/workflow_engine/helpers/validation/variables/action_schemas.ts +++ b/services/platform/convex/workflow_engine/helpers/validation/variables/action_schemas.ts @@ -273,7 +273,7 @@ const processingRecordFields = createDocFields('workflowProcessingRecords', { const workflowProcessingRecordsSchemas: Record = { find_unprocessed: { // Dynamic output - returns full document from the specified table - // Can be customers, products, documents, websitePages, etc. + // Can be customers, products, documents, etc. 
// We can't statically validate fields since it depends on tableName parameter description: 'Full unprocessed record from the specified table, or null', nullable: true, @@ -589,58 +589,6 @@ const websiteSchemas: Record = { }, }; -// ============================================================================= -// WEBSITE PAGES ACTION SCHEMAS -// ============================================================================= - -const websitePagesSchemas: Record = { - bulk_upsert: { - description: 'Bulk upsert result', - fields: { - created: { type: 'number', description: 'Number of pages created' }, - updated: { type: 'number', description: 'Number of pages updated' }, - total: { type: 'number', description: 'Total pages processed' }, - }, - }, - register_urls: { - description: 'Register URLs result', - fields: { - registered: { - type: 'number', - description: 'Number of new URLs registered', - }, - updated: { - type: 'number', - description: 'Number of URLs with changed content hash', - }, - deleted: { type: 'number', description: 'Number of URLs deleted' }, - skipped: { - type: 'number', - description: 'Number of unchanged URLs skipped', - }, - total: { type: 'number', description: 'Total URLs processed' }, - urlsToSync: { - type: 'array', - description: 'URLs that need content fetching', - }, - }, - }, - crawl_and_upsert: { - description: 'Crawl and upsert result', - fields: { - processed: { - type: 'number', - description: 'Number of pages successfully crawled', - }, - failed: { - type: 'number', - description: 'Number of pages that failed to crawl', - }, - total: { type: 'number', description: 'Total URLs requested' }, - }, - }, -}; - // ============================================================================= // WORKFLOW ACTION SCHEMAS // ============================================================================= @@ -689,7 +637,6 @@ const workflowSchemas: Record = { * - onedrive: OneDrive file operations * - crawler: Website crawling * - website: Website 
management - * - websitePages: Website page management * - workflow: Workflow utility operations */ export const actionOutputSchemaRegistry: ActionOutputSchemaRegistry = { @@ -705,7 +652,6 @@ export const actionOutputSchemaRegistry: ActionOutputSchemaRegistry = { onedrive: onedriveSchemas, crawler: crawlerSchemas, website: websiteSchemas, - websitePages: websitePagesSchemas, workflow: workflowSchemas, }; diff --git a/services/platform/convex/workflow_engine/workflow_syntax_compact.ts b/services/platform/convex/workflow_engine/workflow_syntax_compact.ts index d32dcb7454..bcdbb8caa6 100644 --- a/services/platform/convex/workflow_engine/workflow_syntax_compact.ts +++ b/services/platform/convex/workflow_engine/workflow_syntax_compact.ts @@ -141,15 +141,6 @@ Params (update): websiteId (required), domain?, title?, description?, scanInterv Params (get_by_domain): domain (required) Output: \`{ data: {...website} | null }\` -### websitePages -Ops: bulk_upsert, register_urls, crawl_and_upsert -Params (bulk_upsert): websiteId (required), pages (required - array of { url, title?, description?, content?, wordCount?, metadata?, structuredData? }) -Output (bulk_upsert): \`{ data: { created, updated, total } }\` -Params (register_urls): websiteId (required), urls (required - array of { url, contentHash?, status? }) -Output (register_urls): \`{ data: { registered, updated, deleted, skipped, total, urlsToSync } }\` -Params (crawl_and_upsert): websiteId (required), urls (required - array of URL strings), wordCountThreshold?, crawlerTimeoutMs? -Output (crawl_and_upsert): \`{ data: { processed, failed, total } }\` - ### workflow Ops: upload_all_workflows Params: timeout? 
(default: 120000ms) diff --git a/services/platform/convex/workflows/processing_records/get_table_indexes.ts b/services/platform/convex/workflows/processing_records/get_table_indexes.ts index 4c582c7c07..b6d5fff6c3 100644 --- a/services/platform/convex/workflows/processing_records/get_table_indexes.ts +++ b/services/platform/convex/workflows/processing_records/get_table_indexes.ts @@ -114,23 +114,6 @@ export const TABLE_INDEXES: Record = { { name: 'by_organizationId', fields: ['organizationId'] }, ], - websitePages: [ - { - name: 'by_websiteId_and_syncStatus', - fields: ['websiteId', 'syncStatus'], - }, - { - name: 'by_websiteId_and_lastCrawledAt', - fields: ['websiteId', 'lastCrawledAt'], - }, - { - name: 'by_organizationId_and_url', - fields: ['organizationId', 'url'], - }, - { name: 'by_websiteId', fields: ['websiteId'] }, - { name: 'by_organizationId', fields: ['organizationId'] }, - ], - conversationMessages: [ // by_organizationId_and_direction is optimal for filtering inbound messages { diff --git a/services/platform/convex/workflows/processing_records/internal_mutations.ts b/services/platform/convex/workflows/processing_records/internal_mutations.ts index a443ff5a86..f24994b44f 100644 --- a/services/platform/convex/workflows/processing_records/internal_mutations.ts +++ b/services/platform/convex/workflows/processing_records/internal_mutations.ts @@ -13,7 +13,6 @@ const tableNameValidator = v.union( v.literal('conversationMessages'), v.literal('approvals'), v.literal('onedriveSyncConfigs'), - v.literal('websitePages'), ); export const findUnprocessed = internalMutation({ diff --git a/services/platform/convex/workflows/processing_records/types.ts b/services/platform/convex/workflows/processing_records/types.ts index df312bd619..fd20ea4c04 100644 --- a/services/platform/convex/workflows/processing_records/types.ts +++ b/services/platform/convex/workflows/processing_records/types.ts @@ -12,8 +12,7 @@ export type TableName = | 'conversations' | 
'conversationMessages' | 'approvals' - | 'onedriveSyncConfigs' - | 'websitePages'; + | 'onedriveSyncConfigs'; /** * Arguments for finding and claiming a single unprocessed document with custom query diff --git a/services/platform/lib/shared/schemas/websites.ts b/services/platform/lib/shared/schemas/websites.ts index 9fc589ecd0..451ac92cde 100644 --- a/services/platform/lib/shared/schemas/websites.ts +++ b/services/platform/lib/shared/schemas/websites.ts @@ -2,7 +2,7 @@ import { z } from 'zod/v4'; import { jsonRecordSchema } from './utils/json-value'; -const websiteStatusLiterals = ['active', 'inactive', 'error'] as const; +const websiteStatusLiterals = ['idle', 'scanning', 'active', 'error'] as const; export const websiteStatusSchema = z.enum(websiteStatusLiterals); type WebsiteStatus = z.infer; @@ -21,20 +21,3 @@ export const websiteSchema = z.object({ }); type Website = z.infer; - -export const websitePageSchema = z.object({ - _id: z.string(), - _creationTime: z.number(), - organizationId: z.string(), - websiteId: z.string(), - url: z.string(), - title: z.string().optional(), - content: z.string().optional(), - wordCount: z.number().optional(), - lastCrawledAt: z.number(), - metadata: jsonRecordSchema.optional(), - structuredData: jsonRecordSchema.optional(), - syncStatus: z.enum(['pending', 'synced']).optional(), -}); - -type WebsitePage = z.infer; diff --git a/services/platform/messages/en.json b/services/platform/messages/en.json index 0523dd0bd3..34817ac0fa 100644 --- a/services/platform/messages/en.json +++ b/services/platform/messages/en.json @@ -2742,14 +2742,15 @@ "title": "Website pages", "noPages": "No pages crawled yet", "loadMore": "Load more", - "noContent": "No content", - "showMore": "Show more", - "showLess": "Show less" + "wordCount": "{count} words", + "chunks": "{count} chunks", + "lastCrawled": "Crawled {date}" }, "filter": { "status": { - "active": "Active", + "idle": "Idle", "scanning": "Scanning", + "active": "Active", "error": "Error" } } 
From d67b7e63d0bd0eea6d4a79e0d1715922ca19fddd Mon Sep 17 00:00:00 2001 From: larryro <371767072@qq.com> Date: Fri, 27 Feb 2026 22:46:29 +0800 Subject: [PATCH 2/9] feat: add search UI, chunk inspection, and async website registration Add semantic search and chunk viewer to the website pages dialog. Make website registration non-blocking with background homepage crawl and scheduled metadata sync. Track crawled vs total page counts separately. Remove the standalone rescan action in favor of scheduler-driven scans. Improve reliability with exponential backoff retries, HNSW index auto-creation, scan cancellation, domain normalization, parameterized sort columns, and proper URL encoding. --- services/crawler/app/models.py | 29 +- services/crawler/app/routers/index.py | 6 +- services/crawler/app/routers/pages.py | 53 ++- services/crawler/app/routers/websites.py | 90 ++++- .../crawler/app/services/chunking_service.py | 8 +- .../crawler/app/services/embedding_service.py | 36 +- .../crawler/app/services/indexing_service.py | 12 +- .../crawler/app/services/pg_website_store.py | 40 ++- services/crawler/app/services/scheduler.py | 47 ++- services/crawler/app/utils/metadata.py | 14 + .../crawler/tests/test_chunking_service.py | 33 ++ .../crawler/tests/test_embedding_service.py | 15 +- .../crawler/tests/test_indexing_service.py | 11 +- services/crawler/tests/test_pages_router.py | 60 ++++ .../crawler/tests/test_websites_router.py | 89 ++++- .../components/website-edit-dialog.tsx | 16 +- .../components/website-pages-cell.tsx | 5 +- .../components/website-pages-dialog.tsx | 318 ++++++++++++++---- .../components/website-row-actions.tsx | 39 +-- .../app/features/websites/hooks/mutations.ts | 7 +- services/platform/convex/_generated/api.d.ts | 2 - services/platform/convex/websites/actions.ts | 146 +++++--- services/platform/convex/websites/helpers.ts | 1 - .../convex/websites/internal_actions.ts | 143 +++++++- .../convex/websites/internal_mutations.ts | 14 +- 
.../convex/websites/rescan_website.ts | 52 --- services/platform/convex/websites/schema.ts | 1 + services/platform/convex/websites/types.ts | 39 +++ .../convex/websites/update_website.ts | 2 + .../platform/convex/websites/validators.ts | 1 + .../platform/lib/shared/schemas/websites.ts | 1 + services/platform/messages/en.json | 14 +- 32 files changed, 1017 insertions(+), 327 deletions(-) create mode 100644 services/crawler/app/utils/metadata.py delete mode 100644 services/platform/convex/websites/rescan_website.ts diff --git a/services/crawler/app/models.py b/services/crawler/app/models.py index bb9f7cad50..11bc1f2562 100644 --- a/services/crawler/app/models.py +++ b/services/crawler/app/models.py @@ -4,7 +4,9 @@ from typing import Any, Literal -from pydantic import BaseModel, Field, HttpUrl +from urllib.parse import urlparse + +from pydantic import BaseModel, Field, HttpUrl, field_validator # Valid Playwright wait_until values WaitUntilType = Literal["load", "domcontentloaded", "networkidle", "commit"] @@ -27,6 +29,14 @@ class RegisterWebsiteRequest(BaseModel): domain: str = Field(..., description="The domain to register (e.g., 'docs.example.com')") scan_interval: int = Field(21600, description="Scan interval in seconds (default: 6h)", ge=60) + @field_validator("domain") + @classmethod + def normalize_domain(cls, v: str) -> str: + """Strip protocol/path — store bare hostname only.""" + if "://" in v: + return urlparse(v).hostname or v + return v + class WebsiteInfoResponse(BaseModel): """Full website information.""" @@ -35,6 +45,7 @@ class WebsiteInfoResponse(BaseModel): title: str | None = None description: str | None = None page_count: int = 0 + crawled_count: int = 0 status: str = "idle" scan_interval: int = 21600 last_scanned_at: str | None = None @@ -356,6 +367,22 @@ class PageListResponse(BaseModel): has_more: bool = False +class PageChunkItem(BaseModel): + """A single chunk from a page.""" + + chunk_index: int + chunk_content: str + + +class 
PageChunksResponse(BaseModel): + """Response containing all chunks for a specific page.""" + + url: str + domain: str + chunks: list[PageChunkItem] = Field(default_factory=list) + total: int = 0 + + # ==================== Indexing Models ==================== diff --git a/services/crawler/app/routers/index.py b/services/crawler/app/routers/index.py index 022af6dbf1..46d849a4e8 100644 --- a/services/crawler/app/routers/index.py +++ b/services/crawler/app/routers/index.py @@ -51,7 +51,11 @@ async def index_page(request: IndexPageRequest): @router.post("/website/{domain}", response_model=IndexWebsiteResponse) async def index_website(domain: str): - """Re-index all pages for a website.""" + """Re-index all pages for a website. + + Website status updates are handled by the scheduler during automated scans. + This endpoint is for manual/on-demand re-indexing only. + """ try: service = _get_indexing_service() result = await service.index_website(domain) diff --git a/services/crawler/app/routers/pages.py b/services/crawler/app/routers/pages.py index c12df87461..6367ba4d16 100644 --- a/services/crawler/app/routers/pages.py +++ b/services/crawler/app/routers/pages.py @@ -5,7 +5,7 @@ from fastapi import APIRouter, HTTPException, Query from loguru import logger -from app.models import PageListItem, PageListResponse +from app.models import PageChunkItem, PageChunksResponse, PageListItem, PageListResponse from app.services.database import get_pool router = APIRouter(prefix="/api/v1/pages", tags=["Pages"]) @@ -23,9 +23,12 @@ async def list_pages( try: pool = get_pool() - valid_sorts = {"last_crawled_at", "discovered_at", "word_count"} - sort_field = sort if sort in valid_sorts else "last_crawled_at" - order = "DESC" + sort_columns = { + "last_crawled_at": "wu.last_crawled_at", + "discovered_at": "wu.discovered_at", + "word_count": "wu.word_count", + } + sort_col = sort_columns.get(sort, "wu.last_crawled_at") async with pool.acquire() as conn: # Build query with optional status 
filter @@ -53,7 +56,7 @@ async def list_pages( GROUP BY url ) c ON c.url = wu.url WHERE {where_clause} - ORDER BY wu.{sort_field} {order} NULLS LAST + ORDER BY {sort_col} DESC NULLS LAST LIMIT ${param_idx} OFFSET ${param_idx + 1}""", *params, ) @@ -91,3 +94,43 @@ async def list_pages( except Exception: logger.exception(f"Error listing pages for {domain}") raise HTTPException(status_code=500, detail="Failed to list pages") from None + + +@router.get("/{domain}/chunks", response_model=PageChunksResponse) +async def get_page_chunks( + domain: str, + url: str = Query(..., description="The page URL to get chunks for"), +): + """Get all indexed chunks for a specific page URL.""" + try: + pool = get_pool() + + async with pool.acquire() as conn: + rows = await conn.fetch( + """SELECT chunk_index, chunk_content + FROM chunks + WHERE domain = $1 AND url = $2 + ORDER BY chunk_index ASC""", + domain, + url, + ) + + chunks = [ + PageChunkItem( + chunk_index=r["chunk_index"], + chunk_content=r["chunk_content"], + ) + for r in rows + ] + + return PageChunksResponse( + url=url, + domain=domain, + chunks=chunks, + total=len(chunks), + ) + except HTTPException: + raise + except Exception: + logger.exception(f"Error getting chunks for {url} in {domain}") + raise HTTPException(status_code=500, detail="Failed to get page chunks") from None diff --git a/services/crawler/app/routers/websites.py b/services/crawler/app/routers/websites.py index 7db7a1814a..76087b6803 100644 --- a/services/crawler/app/routers/websites.py +++ b/services/crawler/app/routers/websites.py @@ -2,14 +2,19 @@ Websites Router — Website registration and URL listing endpoints. 
""" -from datetime import datetime +import asyncio +import hashlib +import json +from datetime import UTC, datetime from fastapi import APIRouter, HTTPException, Query, Request from loguru import logger from app.models import RegisterWebsiteRequest, WebsiteInfoResponse, WebsiteUrl, WebsiteUrlsResponse +from app.services.crawler_service import get_crawler_service from app.services.pg_website_store import PgWebsiteStoreManager -from app.services.scheduler import trigger_scan +from app.services.scheduler import cancel_scan, trigger_scan +from app.utils.metadata import extract_meta_description router = APIRouter(prefix="/api/v1/websites", tags=["Websites"]) @@ -24,20 +29,89 @@ def _format_timestamp(val) -> str | None: if isinstance(val, datetime): return val.isoformat() if isinstance(val, (int, float)): - return datetime.fromtimestamp(val).isoformat() + return datetime.fromtimestamp(val, tz=UTC).isoformat() return str(val) -@router.post("") +async def _initialize_website(domain: str, manager: PgWebsiteStoreManager): + """Background task: crawl homepage + discover URLs concurrently.""" + crawler_service = get_crawler_service() + if not crawler_service.initialized: + await crawler_service.initialize() + + site_store = manager.get_site_store(domain) + + async def _crawl_homepage(): + homepage_url = f"https://{domain}/" + try: + results = await crawler_service.crawl_urls(urls=[homepage_url]) + if not results: + return + page = results[0] + title = page.get("title") + sd = page.get("structured_data") + if isinstance(sd, str): + sd = json.loads(sd) + description = extract_meta_description(sd) + + await site_store.save_discovered_urls([{"url": homepage_url}]) + await site_store.update_content_hashes( + [ + { + "url": homepage_url, + "content_hash": hashlib.sha256(page["content"].encode()).hexdigest(), + "status": "active", + "title": title, + "content": page["content"], + "word_count": page.get("word_count", 0), + "metadata": page.get("metadata"), + "structured_data": sd, + } 
+ ] + ) + await manager.update_website_metadata( + domain=domain, + title=title, + description=description, + page_count=1, + ) + except Exception: + logger.exception(f"Failed to crawl homepage for {domain}") + + async def _discover_urls(): + try: + discovered = await crawler_service.discover_urls(domain=domain, max_urls=-1) + if discovered: + await site_store.save_discovered_urls(discovered) + logger.info(f"Discovered {len(discovered)} URLs for {domain}") + except Exception: + logger.exception(f"URL discovery failed for {domain}") + + await asyncio.gather(_crawl_homepage(), _discover_urls()) + + await manager.update_last_scanned(domain) + await manager.update_scan_status(domain, "active") + + +@router.post("", response_model=WebsiteInfoResponse) async def register_website(request: RegisterWebsiteRequest, http_request: Request): try: manager = _get_manager(http_request) - result = await manager.register_website( + await manager.register_website( domain=request.domain, scan_interval=request.scan_interval, ) + + # Fire-and-forget: crawl homepage + discover URLs concurrently in background + task = asyncio.create_task(_initialize_website(request.domain, manager)) + task.add_done_callback(lambda t: t.exception() if not t.cancelled() else None) trigger_scan() - return result + + return WebsiteInfoResponse( + domain=request.domain, + status="scanning", + scan_interval=request.scan_interval, + ) except Exception: logger.exception("Error registering website") raise HTTPException(status_code=500, detail="Failed to register website") from None @@ -56,7 +130,8 @@ async def get_website_info(domain: str, http_request: Request): domain=website["domain"], title=website.get("title"), description=website.get("description"), - page_count=website.get("page_count", 0), + page_count=website.get("total_urls", 0), + crawled_count=website.get("crawled_count", 0), status=website.get("status", "idle"), scan_interval=website.get("scan_interval", 21600), 
last_scanned_at=_format_timestamp(website.get("last_scanned_at")), @@ -74,6 +149,7 @@ async def get_website_info(domain: str, http_request: Request): @router.delete("/{domain}") async def deregister_website(domain: str, http_request: Request): try: + cancel_scan(domain) manager = _get_manager(http_request) deleted = await manager.remove_website(domain) if not deleted: diff --git a/services/crawler/app/services/chunking_service.py b/services/crawler/app/services/chunking_service.py index 385324bda5..0eba4b7305 100644 --- a/services/crawler/app/services/chunking_service.py +++ b/services/crawler/app/services/chunking_service.py @@ -22,6 +22,7 @@ class ContentChunk: def chunk_content( content: str, title: str | None = None, + url: str | None = None, chunk_size: int = CHUNK_SIZE, chunk_overlap: int = CHUNK_OVERLAP, min_chunk_length: int = MIN_CHUNK_LENGTH, @@ -30,7 +31,12 @@ def chunk_content( return [] text = content.strip() - prefix = f"{title.strip()}\n\n" if title and title.strip() else "" + parts: list[str] = [] + if title and title.strip(): + parts.append(title.strip()) + if url and url.strip(): + parts.append(url.strip()) + prefix = "\n\n".join(parts) + "\n\n" if parts else "" # Split into paragraphs first paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()] diff --git a/services/crawler/app/services/embedding_service.py b/services/crawler/app/services/embedding_service.py index ea1cc60839..debeff3e10 100644 --- a/services/crawler/app/services/embedding_service.py +++ b/services/crawler/app/services/embedding_service.py @@ -13,7 +13,8 @@ MAX_BATCH_SIZE = 2048 MAX_CONCURRENT_REQUESTS = 3 -RETRY_DELAY_SECONDS = 1.0 +MAX_RETRIES = 3 +RETRY_BASE_DELAY = 1.0 class EmbeddingService: @@ -29,22 +30,23 @@ def dimensions(self) -> int: async def _embed_batch(self, batch: list[str]) -> list[list[float]]: async with self._semaphore: - try: - response = await self._client.embeddings.create( - model=self._model, - input=batch, - dimensions=self._dimensions, - ) - 
return [item.embedding for item in response.data] - except Exception: - logger.warning(f"Embedding request failed, retrying in {RETRY_DELAY_SECONDS}s") - await asyncio.sleep(RETRY_DELAY_SECONDS) - response = await self._client.embeddings.create( - model=self._model, - input=batch, - dimensions=self._dimensions, - ) - return [item.embedding for item in response.data] + for attempt in range(MAX_RETRIES): + try: + response = await self._client.embeddings.create( + model=self._model, + input=batch, + dimensions=self._dimensions, + ) + return [item.embedding for item in response.data] + except Exception: + if attempt == MAX_RETRIES - 1: + raise + delay = RETRY_BASE_DELAY * (2**attempt) + logger.warning( + f"Embedding request failed (attempt {attempt + 1}/{MAX_RETRIES}), retrying in {delay}s" + ) + await asyncio.sleep(delay) + raise RuntimeError("unreachable") async def embed_texts(self, texts: list[str]) -> list[list[float]]: if not texts: diff --git a/services/crawler/app/services/indexing_service.py b/services/crawler/app/services/indexing_service.py index 44f9e58164..2cc2ca43cc 100644 --- a/services/crawler/app/services/indexing_service.py +++ b/services/crawler/app/services/indexing_service.py @@ -24,6 +24,7 @@ class IndexingService: def __init__(self, pool: asyncpg.Pool, embedding_service: EmbeddingService): self._pool = pool self._embedding = embedding_service + self._hnsw_ensured = False async def index_page(self, domain: str, url: str, title: str | None, content: str) -> dict: content_hash = _sha256(content) @@ -35,7 +36,7 @@ async def index_page(self, domain: str, url: str, title: str | None, content: st return {"url": url, "status": "skipped", "chunks_indexed": 0} # Chunk content - chunks = chunk_content(content, title=title) + chunks = chunk_content(content, title=title, url=url) if not chunks: return {"url": url, "status": "empty", "chunks_indexed": 0} @@ -71,6 +72,15 @@ async def index_page(self, domain: str, url: str, title: str | None, content: st ], ) + 
# Ensure HNSW index exists once embeddings are stored + if not self._hnsw_ensured: + try: + async with self._pool.acquire() as conn: + await conn.execute("SELECT create_chunks_hnsw_index()") + self._hnsw_ensured = True + except Exception: + pass + logger.info(f"Indexed {len(chunks)} chunks for {url}") return {"url": url, "status": "indexed", "chunks_indexed": len(chunks)} diff --git a/services/crawler/app/services/pg_website_store.py b/services/crawler/app/services/pg_website_store.py index 1d01ecac12..854801a726 100644 --- a/services/crawler/app/services/pg_website_store.py +++ b/services/crawler/app/services/pg_website_store.py @@ -23,19 +23,22 @@ def __init__(self, pool: asyncpg.Pool, domain: str): self._domain = domain async def save_discovered_urls(self, urls: list[dict]) -> int: + """Save discovered URLs. Returns number of newly inserted URLs (excludes duplicates).""" if not urls: return 0 async with self._pool.acquire() as conn: + count_before = await conn.fetchval("SELECT COUNT(*) FROM website_urls WHERE domain = $1", self._domain) await conn.executemany( """INSERT INTO website_urls (domain, url, discovered_at) VALUES ($1, $2, NOW()) ON CONFLICT (domain, url) DO NOTHING""", [(self._domain, u["url"]) for u in urls], ) - count = await conn.fetchval("SELECT COUNT(*) FROM website_urls WHERE domain = $1", self._domain) - logger.info(f"Saved discovered URLs for {self._domain}, total: {count}") - return len(urls) + count_after = await conn.fetchval("SELECT COUNT(*) FROM website_urls WHERE domain = $1", self._domain) + inserted = count_after - count_before + logger.info(f"Saved discovered URLs for {self._domain}: {inserted} new, {count_after} total") + return inserted async def get_urls_page(self, offset: int = 0, limit: int = 100, status: str | None = None) -> list[dict]: async with self._pool.acquire() as conn: @@ -128,8 +131,8 @@ async def update_content_hashes(self, updates: list[dict]) -> None: u.get("title"), u.get("content"), u.get("word_count"), - 
u.get("metadata"), - u.get("structured_data"), + json.dumps(u["metadata"]) if u.get("metadata") else None, + json.dumps(u["structured_data"]) if u.get("structured_data") else None, ) for u in updates ], @@ -206,8 +209,8 @@ async def get_cached_pages(self, urls: list[str]) -> list[dict]: "title": r["title"], "content": r["content"], "word_count": r["word_count"] or 0, - "metadata": json.loads(r["metadata"]) if r["metadata"] else None, - "structured_data": json.loads(r["structured_data"]) if r["structured_data"] else None, + "metadata": r["metadata"], + "structured_data": r["structured_data"], } for r in rows ] @@ -258,11 +261,9 @@ async def update_website_metadata( async def remove_website(self, domain: str) -> bool: self._stores.pop(domain, None) async with self._pool.acquire() as conn: - async with conn.transaction(): - await conn.execute("DELETE FROM chunks WHERE domain = $1", domain) - await conn.execute("DELETE FROM website_urls WHERE domain = $1", domain) - result = await conn.execute("DELETE FROM websites WHERE domain = $1", domain) - deleted = result == "DELETE 1" + # ON DELETE CASCADE on website_urls and chunks handles child row cleanup + result = await conn.execute("DELETE FROM websites WHERE domain = $1", domain) + deleted = result == "DELETE 1" if deleted: logger.info(f"Removed website: {domain}") return deleted @@ -297,9 +298,18 @@ async def update_last_scanned(self, domain: str) -> None: async def get_website(self, domain: str) -> dict | None: async with self._pool.acquire() as conn: row = await conn.fetchrow( - """SELECT domain, title, description, page_count, status, scan_interval, - last_scanned_at, error, created_at, updated_at - FROM websites WHERE domain = $1""", + """SELECT w.domain, w.title, w.description, w.page_count, w.status, + w.scan_interval, w.last_scanned_at, w.error, + w.created_at, w.updated_at, + COALESCE(u.total, 0) AS total_urls, + COALESCE(u.crawled, 0) AS crawled_count + FROM websites w + LEFT JOIN LATERAL ( + SELECT COUNT(*) AS 
total, + COUNT(*) FILTER (WHERE content_hash IS NOT NULL) AS crawled + FROM website_urls WHERE domain = w.domain + ) u ON true + WHERE w.domain = $1""", domain, ) return dict(row) if row else None diff --git a/services/crawler/app/services/scheduler.py b/services/crawler/app/services/scheduler.py index 9991c69ecb..b23a8dcacb 100644 --- a/services/crawler/app/services/scheduler.py +++ b/services/crawler/app/services/scheduler.py @@ -16,6 +16,7 @@ from app.services.crawler_service import CrawlerService from app.services.indexing_service import IndexingService from app.services.pg_website_store import PgWebsiteStore, PgWebsiteStoreManager +from app.utils.metadata import extract_meta_description logger = logging.getLogger(__name__) @@ -27,6 +28,7 @@ _HEAD_BATCH_SIZE = 50 _scan_trigger: asyncio.Event | None = None +_cancelled_domains: set[str] = set() def _sha256(content: str) -> str: @@ -39,6 +41,19 @@ def trigger_scan(): _scan_trigger.set() +def cancel_scan(domain: str): + """Mark a domain for scan cancellation.""" + _cancelled_domains.add(domain) + + +def _is_cancelled(domain: str) -> bool: + return domain in _cancelled_domains + + +def _clear_cancelled(domain: str): + _cancelled_domains.discard(domain) + + async def run_scheduler( store_manager: PgWebsiteStoreManager, crawler_service: CrawlerService, @@ -170,19 +185,6 @@ def _is_homepage(url: str, domain: str) -> bool: return parsed.netloc == domain and parsed.path in ("", "/") -def _extract_meta_description(structured_data: dict | None) -> str | None: - """Extract meta description from structured data.""" - if not structured_data: - return None - meta = structured_data.get("meta", {}) - if desc := meta.get("description"): - return desc - og = structured_data.get("opengraph", {}) - if desc := og.get("og:description"): - return desc - return None - - async def _bulk_head_check( all_urls: list[str], site_store: PgWebsiteStore, @@ -209,6 +211,7 @@ async def _scan_website( crawler_service: CrawlerService, 
indexing_service: IndexingService | None = None, ): + _clear_cancelled(domain) site_store = store_manager.get_site_store(domain) await store_manager.update_scan_status(domain, "scanning") @@ -217,12 +220,20 @@ async def _scan_website( await crawler_service.initialize() # Phase 1: Discover new URLs + if _is_cancelled(domain): + logger.info(f"Scan [{domain}]: cancelled before discovery") + await store_manager.update_scan_status(domain, "idle") + return logger.info(f"Scan [{domain}]: Phase 1 — discovering URLs") discovered = await crawler_service.discover_urls(domain=domain, max_urls=-1) await site_store.save_discovered_urls(discovered) logger.info(f"Scan [{domain}]: discovered {len(discovered)} URLs") # Phase 2: Bulk HEAD check — filter unchanged URLs up front + if _is_cancelled(domain): + logger.info(f"Scan [{domain}]: cancelled before HEAD check") + await store_manager.update_scan_status(domain, "idle") + return scan_start = time.time() all_urls = await site_store.get_urls_needing_recrawl(limit=10000, crawled_before=scan_start) if not all_urls: @@ -246,6 +257,10 @@ async def _scan_website( homepage_description: str | None = None for i in range(0, len(needs_crawl), CRAWL_BATCH_SIZE): + if _is_cancelled(domain): + logger.info(f"Scan [{domain}]: cancelled during crawl (crawled {crawled_total} so far)") + await store_manager.update_scan_status(domain, "idle") + return batch = needs_crawl[i : i + CRAWL_BATCH_SIZE] logger.info( f"Scan [{domain}]: Phase 3 — crawling batch {i // CRAWL_BATCH_SIZE + 1} " @@ -263,8 +278,8 @@ async def _scan_website( "title": p.get("title"), "content": p["content"], "word_count": p.get("word_count", 0), - "metadata": json.dumps(p.get("metadata")) if p.get("metadata") else None, - "structured_data": json.dumps(p.get("structured_data")) if p.get("structured_data") else None, + "metadata": p.get("metadata"), + "structured_data": p.get("structured_data"), } for p in results ] @@ -278,7 +293,7 @@ async def _scan_website( sd = 
p.get("structured_data") if isinstance(sd, str): sd = json.loads(sd) - homepage_description = _extract_meta_description(sd) + homepage_description = extract_meta_description(sd) break if indexing_service: diff --git a/services/crawler/app/utils/metadata.py b/services/crawler/app/utils/metadata.py new file mode 100644 index 0000000000..389e8fb216 --- /dev/null +++ b/services/crawler/app/utils/metadata.py @@ -0,0 +1,14 @@ +"""Shared metadata extraction utilities.""" + + +def extract_meta_description(structured_data: dict | None) -> str | None: + """Extract meta description from structured data (meta tags or OpenGraph).""" + if not structured_data: + return None + meta = structured_data.get("meta", {}) + if desc := meta.get("description"): + return desc + og = structured_data.get("opengraph", {}) + if desc := og.get("og:description"): + return desc + return None diff --git a/services/crawler/tests/test_chunking_service.py b/services/crawler/tests/test_chunking_service.py index 2de3307ae4..3f065939ba 100644 --- a/services/crawler/tests/test_chunking_service.py +++ b/services/crawler/tests/test_chunking_service.py @@ -73,6 +73,39 @@ def test_title_prepended_to_every_chunk(self): assert chunk.content.startswith("Title") +class TestChunkContentWithUrl: + BODY = "Some body text here that is long enough to pass the minimum chunk length filter." 
+ + def test_url_prepended_to_single_chunk(self): + result = chunk_content(self.BODY, url="https://example.com/page") + assert result[0].content.startswith("https://example.com/page\n\n") + assert self.BODY in result[0].content + + def test_none_url_ignored(self): + result = chunk_content(self.BODY, url=None) + assert result[0].content == self.BODY + + def test_empty_url_ignored(self): + result = chunk_content(self.BODY, url="") + assert result[0].content == self.BODY + + def test_whitespace_url_ignored(self): + result = chunk_content(self.BODY, url=" ") + assert result[0].content == self.BODY + + def test_title_and_url_both_in_prefix(self): + result = chunk_content(self.BODY, title="My Title", url="https://example.com/page") + assert result[0].content.startswith("My Title\n\nhttps://example.com/page\n\n") + assert self.BODY in result[0].content + + def test_url_prepended_to_every_chunk(self): + para = "A" * 100 + content = f"{para}\n\n{para}\n\n{para}" + result = chunk_content(content, url="https://example.com", chunk_size=200, chunk_overlap=20) + for chunk in result: + assert "https://example.com" in chunk.content + + class TestChunkContentMultipleParagraphs: def test_two_paragraphs_within_limit_stay_in_one_chunk(self): p1 = "First paragraph with enough content to be meaningful here." 
diff --git a/services/crawler/tests/test_embedding_service.py b/services/crawler/tests/test_embedding_service.py index 8d4f6a004e..634ec1be0e 100644 --- a/services/crawler/tests/test_embedding_service.py +++ b/services/crawler/tests/test_embedding_service.py @@ -136,21 +136,22 @@ async def test_retries_on_first_failure(self, mock_sleep): assert result == expected assert service._client.embeddings.create.call_count == 2 - mock_sleep.assert_awaited_once() + mock_sleep.assert_awaited_once_with(1.0) @patch("app.services.embedding_service.asyncio.sleep", new_callable=AsyncMock) - async def test_raises_on_second_failure(self, mock_sleep): + async def test_raises_after_all_retries_exhausted(self, mock_sleep): service = create_service(dimensions=2) service._client.embeddings.create.side_effect = [ - RuntimeError("API error"), - RuntimeError("API error again"), + RuntimeError("API error 1"), + RuntimeError("API error 2"), + RuntimeError("API error 3"), ] - with pytest.raises(RuntimeError, match="API error again"): + with pytest.raises(RuntimeError, match="API error 3"): await service.embed_texts(["hello"]) - assert service._client.embeddings.create.call_count == 2 - mock_sleep.assert_awaited_once() + assert service._client.embeddings.create.call_count == 3 + assert mock_sleep.await_count == 2 async def test_no_retry_on_success(self): service = create_service(dimensions=2) diff --git a/services/crawler/tests/test_indexing_service.py b/services/crawler/tests/test_indexing_service.py index bee270cd9a..04e8f1713f 100644 --- a/services/crawler/tests/test_indexing_service.py +++ b/services/crawler/tests/test_indexing_service.py @@ -124,10 +124,13 @@ async def test_returns_zero_when_result_is_empty(self, indexing_service, mock_co class TestIndexWebsite: async def test_aggregates_results_correctly(self, indexing_service, mock_conn): mock_conn.fetch = AsyncMock( - return_value=[ - {"url": "https://example.com/a", "title": "Page A", "content": "aaa"}, - {"url": 
"https://example.com/b", "title": "Page B", "content": "bbb"}, - {"url": "https://example.com/c", "title": "Page C", "content": "ccc"}, + side_effect=[ + [ + {"url": "https://example.com/a", "title": "Page A", "content": "aaa"}, + {"url": "https://example.com/b", "title": "Page B", "content": "bbb"}, + {"url": "https://example.com/c", "title": "Page C", "content": "ccc"}, + ], + [], ] ) diff --git a/services/crawler/tests/test_pages_router.py b/services/crawler/tests/test_pages_router.py index 22dbd63963..9d4c584643 100644 --- a/services/crawler/tests/test_pages_router.py +++ b/services/crawler/tests/test_pages_router.py @@ -202,3 +202,63 @@ async def test_500_on_database_error(self, mock_pool): assert response.status_code == 500 assert response.json()["detail"] == "Failed to list pages" + + +def _make_chunk_row(**overrides): + defaults = { + "chunk_index": 0, + "chunk_content": "This is chunk content.", + } + defaults.update(overrides) + return FakeRecord(defaults) + + +class TestGetPageChunks: + async def test_success(self, mock_pool): + rows = [ + _make_chunk_row(chunk_index=0, chunk_content="First chunk"), + _make_chunk_row(chunk_index=1, chunk_content="Second chunk"), + ] + mock_pool.fetch.return_value = rows + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get( + "/api/v1/pages/example.com/chunks", + params={"url": "https://example.com/page1"}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["url"] == "https://example.com/page1" + assert data["domain"] == "example.com" + assert data["total"] == 2 + assert len(data["chunks"]) == 2 + assert data["chunks"][0]["chunk_index"] == 0 + assert data["chunks"][0]["chunk_content"] == "First chunk" + assert data["chunks"][1]["chunk_index"] == 1 + + async def test_empty_chunks(self, mock_pool): + mock_pool.fetch.return_value = [] + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as 
client: + response = await client.get( + "/api/v1/pages/example.com/chunks", + params={"url": "https://example.com/no-chunks"}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["chunks"] == [] + assert data["total"] == 0 + + async def test_500_on_database_error(self, mock_pool): + mock_pool.fetch.side_effect = RuntimeError("connection lost") + + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.get( + "/api/v1/pages/example.com/chunks", + params={"url": "https://example.com/page1"}, + ) + + assert response.status_code == 500 + assert response.json()["detail"] == "Failed to get page chunks" diff --git a/services/crawler/tests/test_websites_router.py b/services/crawler/tests/test_websites_router.py index a46a0eb890..2e5919791f 100644 --- a/services/crawler/tests/test_websites_router.py +++ b/services/crawler/tests/test_websites_router.py @@ -19,6 +19,24 @@ def mock_manager(): del app.state.pg_store_manager +def _website_row(domain="example.com", scan_interval=21600, **overrides): + return { + "domain": domain, + "title": None, + "description": None, + "page_count": 0, + "total_urls": 0, + "crawled_count": 0, + "status": "idle", + "scan_interval": scan_interval, + "last_scanned_at": None, + "error": None, + "created_at": None, + "updated_at": None, + **overrides, + } + + class TestRegisterWebsite: async def test_success(self, mock_manager): mock_manager.register_website.return_value = { @@ -27,7 +45,10 @@ async def test_success(self, mock_manager): "scan_interval": 21600, } - with patch("app.routers.websites.trigger_scan") as mock_trigger: + with ( + patch("app.routers.websites.trigger_scan") as mock_trigger, + patch("app.routers.websites._initialize_website"), + ): async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: response = await client.post( "/api/v1/websites", @@ -37,7 +58,7 @@ async def test_success(self, mock_manager): 
assert response.status_code == 200 data = response.json() assert data["domain"] == "example.com" - assert data["status"] == "idle" + assert data["status"] == "scanning" assert data["scan_interval"] == 21600 mock_manager.register_website.assert_awaited_once_with( domain="example.com", @@ -45,14 +66,42 @@ async def test_success(self, mock_manager): ) mock_trigger.assert_called_once() + async def test_normalizes_full_url_to_domain(self, mock_manager): + mock_manager.register_website.return_value = { + "domain": "www.wisekey.com", + "status": "idle", + "scan_interval": 21600, + } + mock_manager.get_website.return_value = _website_row(domain="www.wisekey.com") + + with ( + patch("app.routers.websites.trigger_scan"), + patch("app.routers.websites._initialize_website"), + ): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/api/v1/websites", + json={"domain": "https://www.wisekey.com", "scan_interval": 21600}, + ) + + assert response.status_code == 200 + mock_manager.register_website.assert_awaited_once_with( + domain="www.wisekey.com", + scan_interval=21600, + ) + async def test_uses_default_scan_interval(self, mock_manager): mock_manager.register_website.return_value = { "domain": "example.com", "status": "idle", "scan_interval": 21600, } + mock_manager.get_website.return_value = _website_row() - with patch("app.routers.websites.trigger_scan"): + with ( + patch("app.routers.websites.trigger_scan"), + patch("app.routers.websites._initialize_website"), + ): async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: response = await client.post( "/api/v1/websites", @@ -65,10 +114,37 @@ async def test_uses_default_scan_interval(self, mock_manager): scan_interval=21600, ) + async def test_returns_scanning_status_immediately(self, mock_manager): + mock_manager.register_website.return_value = { + "domain": "example.com", + "status": "idle", + "scan_interval": 21600, 
+ } + + with ( + patch("app.routers.websites.trigger_scan"), + patch("app.routers.websites._initialize_website"), + ): + async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: + response = await client.post( + "/api/v1/websites", + json={"domain": "example.com"}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["title"] is None + assert data["page_count"] == 0 + assert data["crawled_count"] == 0 + assert data["status"] == "scanning" + async def test_500_on_error(self, mock_manager): mock_manager.register_website.side_effect = RuntimeError("db error") - with patch("app.routers.websites.trigger_scan"): + with ( + patch("app.routers.websites.trigger_scan"), + patch("app.routers.websites._initialize_website"), + ): async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client: response = await client.post( "/api/v1/websites", @@ -86,6 +162,8 @@ async def test_success(self, mock_manager): "title": "Example", "description": "An example site", "page_count": 42, + "total_urls": 50, + "crawled_count": 42, "status": "active", "scan_interval": 3600, "last_scanned_at": 1700000000.0, @@ -102,7 +180,8 @@ async def test_success(self, mock_manager): assert data["domain"] == "example.com" assert data["title"] == "Example" assert data["description"] == "An example site" - assert data["page_count"] == 42 + assert data["page_count"] == 50 + assert data["crawled_count"] == 42 assert data["status"] == "active" assert data["scan_interval"] == 3600 assert data["last_scanned_at"] is not None diff --git a/services/platform/app/features/websites/components/website-edit-dialog.tsx b/services/platform/app/features/websites/components/website-edit-dialog.tsx index 4eb1a638e7..7a7688312d 100644 --- a/services/platform/app/features/websites/components/website-edit-dialog.tsx +++ b/services/platform/app/features/websites/components/website-edit-dialog.tsx @@ -15,7 +15,6 @@ import { useT } from 
'@/lib/i18n/client'; import { useUpdateWebsite } from '../hooks/mutations'; type FormData = { - domain: string; scanInterval: string; }; @@ -36,10 +35,6 @@ export function EditWebsiteDialog({ const formSchema = useMemo( () => z.object({ - domain: z - .string() - .min(1, tWebsites('validation.domainRequired')) - .url(tWebsites('validation.validUrl')), scanInterval: z .string() .min(1, tWebsites('validation.scanIntervalRequired')), @@ -58,7 +53,6 @@ export function EditWebsiteDialog({ ]; const { - register, handleSubmit, formState: { errors }, reset, @@ -67,7 +61,6 @@ export function EditWebsiteDialog({ } = useForm({ resolver: zodResolver(formSchema), defaultValues: { - domain: website.domain, scanInterval: website.scanInterval, }, }); @@ -77,7 +70,6 @@ export function EditWebsiteDialog({ useEffect(() => { if (website) { reset({ - domain: website.domain, scanInterval: website.scanInterval, }); } @@ -87,7 +79,6 @@ export function EditWebsiteDialog({ updateWebsite( { websiteId: website._id, - domain: data.domain, scanInterval: data.scanInterval, }, { @@ -119,12 +110,9 @@ export function EditWebsiteDialog({ >