diff --git a/packages/tale_knowledge/src/tale_knowledge/extraction/image.py b/packages/tale_knowledge/src/tale_knowledge/extraction/image.py
index ebbb36b073..1074f6f487 100644
--- a/packages/tale_knowledge/src/tale_knowledge/extraction/image.py
+++ b/packages/tale_knowledge/src/tale_knowledge/extraction/image.py
@@ -48,7 +48,10 @@ async def extract_text_from_image_bytes(
         Tuple of (extracted_text, vision_was_used).
     """
     if not vision_client:
-        logger.warning(f"No vision client provided for image extraction: {filename}")
+        logger.warning(
+            f"No vision client configured for image extraction: {filename}. "
+            f"Image OCR requires a vision model in provider settings."
+        )
         return "", False
 
     logger.info(f"Processing image: {filename}")
diff --git a/packages/tale_knowledge/src/tale_knowledge/extraction/pdf.py b/packages/tale_knowledge/src/tale_knowledge/extraction/pdf.py
index b48da93548..c605466a41 100644
--- a/packages/tale_knowledge/src/tale_knowledge/extraction/pdf.py
+++ b/packages/tale_knowledge/src/tale_knowledge/extraction/pdf.py
@@ -3,12 +3,20 @@
 Hybrid approach:
 1. Digital PDFs: Extract text directly using PyMuPDF (no API calls)
 2. Embedded images: Large images (>50% page area) are OCR'd, smaller ones described
+
+Scanned page detection:
+- Pages with < SCANNED_PAGE_TEXT_THRESHOLD characters of digital text are
+  considered "scanned" when they also contain at least one embedded image
+  covering > LARGE_IMAGE_RATIO of the page area.
+- The extraction result includes `scanned_pages_detected` and `ocr_applied`
+  so callers can inform the user about OCR activity.
 """
 
 from __future__ import annotations
 
 import asyncio
 from collections.abc import Callable
+from dataclasses import dataclass
 from functools import partial
 from typing import TYPE_CHECKING
 
@@ -23,10 +31,21 @@
 ProgressCallback = Callable[[int, int], None]  # (pages_done, total_pages)
 
 LARGE_IMAGE_RATIO = 0.5
+SCANNED_PAGE_TEXT_THRESHOLD = 50
 MAX_PAGES = 2000
 DEFAULT_PAGE_CONCURRENCY = 8
 
 
+@dataclass(frozen=True, slots=True)
+class PdfExtractionResult:
+    """Result of a PDF text extraction."""
+
+    text: str
+    vision_used: bool
+    scanned_pages_detected: int
+    ocr_applied: bool
+
+
 def _extract_page_text_sync(page_bytes: bytes) -> dict:
     """Extract text and image blocks from a single page (runs in thread pool).
 
@@ -95,11 +114,14 @@ async def _extract_page_with_layout(
     vision_semaphore: asyncio.Semaphore,
     vision_client: VisionClient | None,
     process_images: bool,
-) -> tuple[str, bool]:
+) -> tuple[str, bool, bool]:
     """Extract page content preserving text and image positions.
 
     Images covering >50% of the page area are OCR'd (likely scanned pages),
     smaller images are described.
+
+    Returns:
+        Tuple of (content, vision_used, is_scanned_page).
     """
     loop = asyncio.get_running_loop()
 
@@ -108,8 +130,18 @@ async def _extract_page_with_layout(
     )
     elements: list[tuple[float, str]] = text_data["elements"]
     images: list[tuple[float, bytes, float]] = text_data["images"]
+    total_text_len: int = text_data["total_text_len"]
     vision_used = False
 
+    has_large_image = any(area_ratio > LARGE_IMAGE_RATIO for _, _, area_ratio in images)
+    is_scanned_page = total_text_len < SCANNED_PAGE_TEXT_THRESHOLD and has_large_image
+
+    if is_scanned_page and not vision_client:
+        logger.warning(
+            f"Page {page_num + 1}: Scanned page detected ({total_text_len} chars, "
+            f"large image present) but no vision client configured — OCR skipped"
+        )
+
     if process_images and vision_client and images:
         for y0, img_bytes, area_ratio in images:
             try:
@@ -132,7 +164,7 @@ async def _extract_page_with_layout(
 
     elements.sort(key=lambda x: x[0])
     content = "\n\n".join(elem[1] for elem in elements)
-    return content, vision_used
+    return content, vision_used, is_scanned_page
 
 
 async def extract_text_from_pdf_bytes(
@@ -143,7 +175,7 @@ async def extract_text_from_pdf_bytes(
     process_images: bool = True,
     max_pages: int = MAX_PAGES,
     on_progress: ProgressCallback | None = None,
-) -> tuple[str, bool]:
+) -> PdfExtractionResult:
     """Extract text from PDF bytes.
 
     Args:
@@ -156,7 +188,7 @@ async def extract_text_from_pdf_bytes(
             after each page completes.  Safe to call from concurrent tasks.
 
     Returns:
-        Tuple of (extracted_text, vision_was_used).
+        PdfExtractionResult with text, vision_used, scanned_pages_detected, and ocr_applied.
     """
     logger.info(f"Processing PDF: {filename}")
 
@@ -190,10 +222,12 @@ async def extract_text_from_pdf_bytes(
 
     pages_done = 0
 
-    async def process_page(page_num: int, page_bytes: bytes) -> tuple[int, str, bool]:
+    async def process_page(
+        page_num: int, page_bytes: bytes
+    ) -> tuple[int, str, bool, bool]:
         nonlocal pages_done
         async with page_semaphore:
-            content, vis_used = await _extract_page_with_layout(
+            content, vis_used, is_scanned = await _extract_page_with_layout(
                 page_bytes,
                 page_num,
                 vision_semaphore,
@@ -203,29 +237,52 @@ async def process_page(page_num: int, page_bytes: bytes) -> tuple[int, str, bool
             pages_done += 1
             if on_progress is not None:
                 on_progress(pages_done, pages_to_process)
-            return page_num, f"--- Page {page_num + 1} ---\n{content}", vis_used
+            return (
+                page_num,
+                f"--- Page {page_num + 1} ---\n{content}",
+                vis_used,
+                is_scanned,
+            )
 
     tasks = [process_page(pn, pb) for pn, pb in page_data]
     results = await asyncio.gather(*tasks, return_exceptions=True)
 
     pages_content: list[tuple[int, str]] = []
     vision_used = False
+    scanned_pages_detected = 0
+    ocr_applied = False
 
     for result in results:
         if isinstance(result, Exception):
             logger.warning(f"Page processing failed: {result}")
             continue
-        page_num, content, page_vision_used = result
+        page_num, content, page_vision_used, page_is_scanned = result
         pages_content.append((page_num, content))
         if page_vision_used:
             vision_used = True
+        if page_is_scanned:
+            scanned_pages_detected += 1
+            if page_vision_used:
+                ocr_applied = True
 
     pages_content.sort(key=lambda x: x[0])
     combined_text = "\n\n".join(p[1] for p in pages_content)
 
+    if scanned_pages_detected > 0 and not vision_client:
+        logger.warning(
+            f"PDF '{filename}': {scanned_pages_detected} scanned page(s) detected "
+            f"but no vision client configured — text may be incomplete"
+        )
+
     logger.info(
         f"PDF processing complete: {pages_to_process} pages, {len(combined_text)} chars, "
-        f"Vision API used: {vision_used}"
+        f"Vision API used: {vision_used}, scanned pages: {scanned_pages_detected}, "
+        f"OCR applied: {ocr_applied}"
     )
 
-    return combined_text, vision_used
+    return PdfExtractionResult(
+        text=combined_text,
+        vision_used=vision_used,
+        scanned_pages_detected=scanned_pages_detected,
+        ocr_applied=ocr_applied,
+    )
diff --git a/packages/tale_knowledge/src/tale_knowledge/extraction/router.py b/packages/tale_knowledge/src/tale_knowledge/extraction/router.py
index e16e2cdcd7..5d39ed9ca8 100644
--- a/packages/tale_knowledge/src/tale_knowledge/extraction/router.py
+++ b/packages/tale_knowledge/src/tale_knowledge/extraction/router.py
@@ -67,13 +67,14 @@ async def extract_text(
     if suffix in PDF_EXTENSIONS:
         from .pdf import extract_text_from_pdf_bytes
 
-        return await extract_text_from_pdf_bytes(
+        result = await extract_text_from_pdf_bytes(
             file_bytes,
             filename,
             vision_client=vision_client,
             process_images=process_images,
             on_progress=on_progress,
         )
+        return result.text, result.vision_used
 
     if suffix in DOCX_EXTENSIONS:
         from .docx import extract_text_from_docx_bytes
diff --git a/packages/tale_knowledge/tests/test_pdf.py b/packages/tale_knowledge/tests/test_pdf.py
index 8b60cf9843..f7190a2df5 100644
--- a/packages/tale_knowledge/tests/test_pdf.py
+++ b/packages/tale_knowledge/tests/test_pdf.py
@@ -7,6 +7,7 @@
 
 from tale_knowledge.extraction.pdf import (
     LARGE_IMAGE_RATIO,
+    PdfExtractionResult,
     _extract_page_text_sync,
     extract_text_from_pdf_bytes,
 )
@@ -114,9 +115,12 @@ class TestExtractTextFromPdfBytes:
     @pytest.mark.asyncio
     async def test_digital_pdf_extraction(self):
         pdf_bytes = _make_simple_pdf("Hello World")
-        text, vision_used = await extract_text_from_pdf_bytes(pdf_bytes)
-        assert "Hello World" in text
-        assert vision_used is False
+        result = await extract_text_from_pdf_bytes(pdf_bytes)
+        assert isinstance(result, PdfExtractionResult)
+        assert "Hello World" in result.text
+        assert result.vision_used is False
+        assert result.scanned_pages_detected == 0
+        assert result.ocr_applied is False
 
     @pytest.mark.asyncio
     async def test_multi_page_pdf(self):
@@ -127,11 +131,11 @@ async def test_multi_page_pdf(self):
         pdf_bytes = doc.tobytes()
         doc.close()
 
-        text, vision_used = await extract_text_from_pdf_bytes(pdf_bytes)
-        assert "Page 1" in text
-        assert "Page 2" in text
-        assert "Page 3" in text
-        assert vision_used is False
+        result = await extract_text_from_pdf_bytes(pdf_bytes)
+        assert "Page 1" in result.text
+        assert "Page 2" in result.text
+        assert "Page 3" in result.text
+        assert result.vision_used is False
 
     @pytest.mark.asyncio
     async def test_empty_pdf(self):
@@ -140,17 +144,15 @@ async def test_empty_pdf(self):
         pdf_bytes = doc.tobytes()
         doc.close()
 
-        text, vision_used = await extract_text_from_pdf_bytes(pdf_bytes)
-        assert "--- Page 1 ---" in text
+        result = await extract_text_from_pdf_bytes(pdf_bytes)
+        assert "--- Page 1 ---" in result.text
 
     @pytest.mark.asyncio
     async def test_no_vision_without_client(self):
         pdf_bytes = _make_simple_pdf("Digital text only")
-        text, vision_used = await extract_text_from_pdf_bytes(
-            pdf_bytes, vision_client=None
-        )
-        assert "Digital text only" in text
-        assert vision_used is False
+        result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=None)
+        assert "Digital text only" in result.text
+        assert result.vision_used is False
 
     @pytest.mark.asyncio
     async def test_image_described_with_vision_client(self):
@@ -161,24 +163,20 @@ async def test_image_described_with_vision_client(self):
         mock_client.max_concurrent_pages = 3
         mock_client.describe_image = AsyncMock(return_value="A red square image")
 
-        text, vision_used = await extract_text_from_pdf_bytes(
-            pdf_bytes, vision_client=mock_client
-        )
-        assert long_text in text
-        assert "[Image: A red square image]" in text
-        assert vision_used is True
+        result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=mock_client)
+        assert long_text in result.text
+        assert "[Image: A red square image]" in result.text
+        assert result.vision_used is True
         mock_client.describe_image.assert_called()
 
     @pytest.mark.asyncio
     async def test_image_skipped_without_vision_client(self):
         long_text = "This document has embedded images but no vision client provided"
         pdf_bytes = _make_pdf_with_image(long_text, image_size=200)
-        text, vision_used = await extract_text_from_pdf_bytes(
-            pdf_bytes, vision_client=None
-        )
-        assert long_text in text
-        assert "[Image:" not in text
-        assert vision_used is False
+        result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=None)
+        assert long_text in result.text
+        assert "[Image:" not in result.text
+        assert result.vision_used is False
 
     @pytest.mark.asyncio
     async def test_image_skipped_when_process_images_false(self):
@@ -188,11 +186,11 @@ async def test_image_skipped_when_process_images_false(self):
         mock_client = AsyncMock()
         mock_client.max_concurrent_pages = 3
 
-        text, vision_used = await extract_text_from_pdf_bytes(
+        result = await extract_text_from_pdf_bytes(
             pdf_bytes, vision_client=mock_client, process_images=False
         )
-        assert long_text in text
-        assert "[Image:" not in text
+        assert long_text in result.text
+        assert "[Image:" not in result.text
         mock_client.describe_image.assert_not_called()
 
     @pytest.mark.asyncio
@@ -203,10 +201,19 @@ async def test_large_image_uses_ocr(self):
         mock_client.max_concurrent_pages = 3
         mock_client.ocr_image = AsyncMock(return_value="OCR extracted text from scan")
 
-        text, vision_used = await extract_text_from_pdf_bytes(
-            pdf_bytes, vision_client=mock_client
-        )
-        assert "OCR extracted text from scan" in text
-        assert vision_used is True
+        result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=mock_client)
+        assert "OCR extracted text from scan" in result.text
+        assert result.vision_used is True
+        assert result.scanned_pages_detected == 1
+        assert result.ocr_applied is True
         mock_client.ocr_image.assert_called()
         mock_client.describe_image.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_scanned_page_without_vision_client(self):
+        pdf_bytes = _make_fullpage_image_pdf()
+
+        result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=None)
+        assert result.scanned_pages_detected == 1
+        assert result.ocr_applied is False
+        assert result.vision_used is False
diff --git a/services/crawler/app/models.py b/services/crawler/app/models.py
index 9d5a51c31d..a5bb39d263 100644
--- a/services/crawler/app/models.py
+++ b/services/crawler/app/models.py
@@ -376,6 +376,9 @@ class FileMetadataResponse(BaseModel):
     slide_count: int | None = Field(None, description="Number of slides (PPTX)")
     created_at: int | None = Field(None, description="Document creation date (Unix ms)")
     modified_at: int | None = Field(None, description="Document modification date (Unix ms)")
+    scanned_pages_detected: int | None = Field(
+        None, description="Number of pages detected as scanned (low text content)"
+    )
     error: str | None = Field(None, description="Error message if extraction failed")
 
 
diff --git a/services/crawler/app/routers/docx.py b/services/crawler/app/routers/docx.py
index dc53455863..54dfd1640a 100644
--- a/services/crawler/app/routers/docx.py
+++ b/services/crawler/app/routers/docx.py
@@ -499,6 +499,7 @@ async def extract_docx_metadata(file: UploadFile = _FILE_UPLOAD):
             author=meta["author"] or None,
             created_at=meta["created_at"],
             modified_at=meta["modified_at"],
+            scanned_pages_detected=0,
         )
     except HTTPException:
         raise
diff --git a/services/crawler/app/routers/pdf.py b/services/crawler/app/routers/pdf.py
index f83aa98ea3..24f3c9631c 100644
--- a/services/crawler/app/routers/pdf.py
+++ b/services/crawler/app/routers/pdf.py
@@ -251,6 +251,20 @@ async def extract_pdf_metadata(file: UploadFile = _FILE_UPLOAD):
         doc = fitz.open(stream=file_bytes, filetype="pdf")
         raw = doc.metadata or {}
         page_count = len(doc)
+
+        large_image_ratio = 0.5
+        scanned_count = 0
+        for page_num in range(page_count):
+            page = doc[page_num]
+            page_area = page.rect.get_area()
+            if page_area <= 0:
+                continue
+            for img in page.get_images(full=True):
+                bbox = page.get_image_bbox(img)
+                if bbox and bbox.get_area() / page_area > large_image_ratio:
+                    scanned_count += 1
+                    break
+
         doc.close()
 
         return FileMetadataResponse(
@@ -262,6 +276,7 @@ async def extract_pdf_metadata(file: UploadFile = _FILE_UPLOAD):
             page_count=page_count,
             created_at=_parse_pdf_date(raw.get("creationDate")),
             modified_at=_parse_pdf_date(raw.get("modDate")),
+            scanned_pages_detected=scanned_count,
         )
     except HTTPException:
         raise
diff --git a/services/crawler/app/routers/pptx.py b/services/crawler/app/routers/pptx.py
index 9c512a73d0..593cbaaea9 100644
--- a/services/crawler/app/routers/pptx.py
+++ b/services/crawler/app/routers/pptx.py
@@ -301,6 +301,7 @@ async def extract_pptx_metadata(file: UploadFile = _FILE_UPLOAD):
             slide_count=len(prs.slides),
             created_at=meta["created_at"],
             modified_at=meta["modified_at"],
+            scanned_pages_detected=0,
         )
     except HTTPException:
         raise
diff --git a/services/crawler/app/services/file_parser_service.py b/services/crawler/app/services/file_parser_service.py
index c01a6fd8d5..dffaa2827c 100644
--- a/services/crawler/app/services/file_parser_service.py
+++ b/services/crawler/app/services/file_parser_service.py
@@ -204,7 +204,7 @@ async def parse_pdf_with_vision(
         try:
             acc = UsageAccumulator()
 
-            pages_content, vision_used = await extract_text_from_pdf_bytes(
+            pages_content, vision_used, extraction_meta = await extract_text_from_pdf_bytes(
                 file_bytes,
                 filename,
                 process_images=process_images,
@@ -245,6 +245,8 @@ async def parse_pdf_with_vision(
                 "page_count": len(pages_content),
                 "full_text": "\n\n".join(pages_content),
                 "vision_used": vision_used,
+                "scanned_pages_detected": extraction_meta.scanned_pages_detected,
+                "ocr_applied": extraction_meta.ocr_applied,
                 "metadata": pdf_metadata,
             }
             if acc.total_tokens > 0:
diff --git a/services/crawler/app/services/vision/pdf_extractor.py b/services/crawler/app/services/vision/pdf_extractor.py
index 7bf577dce2..6a570d28d3 100644
--- a/services/crawler/app/services/vision/pdf_extractor.py
+++ b/services/crawler/app/services/vision/pdf_extractor.py
@@ -5,9 +5,15 @@
 2. Scanned PDFs: Detect low-text pages and send to Vision API for OCR
 3. Embedded images: Extract and describe using Vision API
 4. Position preservation: Maintain relative positions of text and images
+
+Scanned page detection uses a text threshold of MIN_TEXT_THRESHOLD characters.
+This is consistent with the knowledge extraction pipeline in tale_knowledge
+which uses SCANNED_PAGE_TEXT_THRESHOLD (same value) combined with a large-image
+area ratio check.
 """
 
 import asyncio
+from dataclasses import dataclass
 
 import fitz  # PyMuPDF
 from loguru import logger
@@ -19,6 +25,16 @@
 MIN_IMAGE_SIZE = 10000  # ~100x100 pixels
 
 
+@dataclass
+class PdfExtractionMetadata:
+    """Metadata about the PDF extraction process."""
+
+    scanned_pages_detected: int = 0
+    ocr_applied: bool = False
+    pages_processed: int = 0
+    vision_used: bool = False
+
+
 async def _describe_image(
     doc: fitz.Document,
     xref: int,
@@ -73,7 +89,7 @@ async def _extract_page_with_layout(
     process_images: bool,
     ocr_scanned_pages: bool,
     usage: UsageAccumulator | None = None,
-) -> tuple[str, bool]:
+) -> tuple[str, bool, bool]:
     """Extract page content preserving text and image positions.
 
     Args:
@@ -84,10 +100,11 @@ async def _extract_page_with_layout(
         ocr_scanned_pages: Whether to OCR pages with low text content
 
     Returns:
-        Tuple of (page_content, vision_used)
+        Tuple of (page_content, vision_used, is_scanned_page).
     """
     elements: list[tuple[float, str]] = []  # (y_coord, content)
     vision_used = False
+    is_scanned_page = False
 
     # Get text blocks with positions
     text_dict = page.get_text("dict")
@@ -107,6 +124,7 @@ async def _extract_page_with_layout(
 
     # Check if this is a scanned page (low text content)
     if total_text_len < MIN_TEXT_THRESHOLD and ocr_scanned_pages:
+        is_scanned_page = True
         logger.debug(f"Page {page.number + 1}: Low text ({total_text_len} chars), sending to Vision API for OCR")
         ocr_text = await _ocr_page(page, semaphore, usage=usage)
         if ocr_text:
@@ -138,7 +156,7 @@ async def _extract_page_with_layout(
     elements.sort(key=lambda x: x[0])
     content = "\n\n".join(elem[1] for elem in elements)
 
-    return content, vision_used
+    return content, vision_used, is_scanned_page
 
 
 async def extract_text_from_pdf_bytes(
@@ -148,7 +166,7 @@ async def extract_text_from_pdf_bytes(
     process_images: bool = True,
     ocr_scanned_pages: bool = True,
     usage: UsageAccumulator | None = None,
-) -> tuple[list[str], bool]:
+) -> tuple[list[str], bool, PdfExtractionMetadata]:
     """Extract text from PDF bytes with Vision support.
 
     Args:
@@ -158,7 +176,7 @@ async def extract_text_from_pdf_bytes(
         ocr_scanned_pages: Whether to OCR pages with low text content
 
     Returns:
-        Tuple of (list of page contents, vision_used flag)
+        Tuple of (list of page contents, vision_used flag, extraction metadata).
     """
     logger.info(f"Processing PDF: {filename}")
 
@@ -166,27 +184,33 @@ async def extract_text_from_pdf_bytes(
     total_pages = len(doc)
     semaphore = asyncio.Semaphore(settings.vision_max_concurrent_pages)
 
-    async def process_page(page_num: int) -> tuple[int, str, bool]:
+    async def process_page(page_num: int) -> tuple[int, str, bool, bool]:
         page = doc[page_num]
-        content, vision_used = await _extract_page_with_layout(
+        content, page_vision_used, page_is_scanned = await _extract_page_with_layout(
             page, doc, semaphore, process_images, ocr_scanned_pages, usage=usage
         )
-        return page_num, f"--- Page {page_num + 1} ---\n{content}", vision_used
+        return page_num, f"--- Page {page_num + 1} ---\n{content}", page_vision_used, page_is_scanned
 
     tasks = [process_page(i) for i in range(total_pages)]
     results = await asyncio.gather(*tasks, return_exceptions=True)
 
     pages_content: list[tuple[int, str]] = []
     vision_used = False
+    scanned_pages_detected = 0
+    ocr_applied = False
 
     for result in results:
         if isinstance(result, Exception):
             logger.warning(f"Page processing failed: {result}")
             continue
-        page_num, content, page_vision_used = result
+        page_num, content, page_vision_used, page_is_scanned = result
         pages_content.append((page_num, content))
         if page_vision_used:
             vision_used = True
+        if page_is_scanned:
+            scanned_pages_detected += 1
+            if page_vision_used:
+                ocr_applied = True
 
     doc.close()
 
@@ -194,6 +218,16 @@ async def process_page(page_num: int) -> tuple[int, str, bool]:
     pages_content.sort(key=lambda x: x[0])
     ordered_content = [p[1] for p in pages_content]
 
-    logger.info(f"PDF processing complete: {total_pages} pages, Vision API used: {vision_used}")
+    metadata = PdfExtractionMetadata(
+        scanned_pages_detected=scanned_pages_detected,
+        ocr_applied=ocr_applied,
+        pages_processed=total_pages,
+        vision_used=vision_used,
+    )
+
+    logger.info(
+        f"PDF processing complete: {total_pages} pages, Vision API used: {vision_used}, "
+        f"scanned pages: {scanned_pages_detected}, OCR applied: {ocr_applied}"
+    )
 
-    return ordered_content, vision_used
+    return ordered_content, vision_used, metadata
diff --git a/services/db/init-scripts/03-create-knowledge-database.sql b/services/db/init-scripts/03-create-knowledge-database.sql
index d9d73d1398..ab53b780e4 100644
--- a/services/db/init-scripts/03-create-knowledge-database.sql
+++ b/services/db/init-scripts/03-create-knowledge-database.sql
@@ -175,6 +175,7 @@ CREATE TABLE IF NOT EXISTS private_knowledge.documents (
     progress_detail TEXT,
     source_created_at  TIMESTAMPTZ,
     source_modified_at TIMESTAMPTZ,
+    ocr_applied   BOOLEAN NOT NULL DEFAULT FALSE,
     created_at    TIMESTAMPTZ NOT NULL DEFAULT NOW(),
     updated_at    TIMESTAMPTZ NOT NULL DEFAULT NOW()
 );
diff --git a/services/db/migrations/db/migrations/20260411000002_add_document_ocr_applied_column.sql b/services/db/migrations/db/migrations/20260411000002_add_document_ocr_applied_column.sql
new file mode 100644
index 0000000000..8c10e4ca10
--- /dev/null
+++ b/services/db/migrations/db/migrations/20260411000002_add_document_ocr_applied_column.sql
@@ -0,0 +1,10 @@
+-- migrate:up
+-- Track whether OCR was applied during document extraction.
+
+ALTER TABLE private_knowledge.documents
+  ADD COLUMN IF NOT EXISTS ocr_applied BOOLEAN NOT NULL DEFAULT FALSE;
+
+-- migrate:down
+
+ALTER TABLE private_knowledge.documents
+  DROP COLUMN IF EXISTS ocr_applied;
diff --git a/services/platform/app/features/documents/components/document-preview-dialog.tsx b/services/platform/app/features/documents/components/document-preview-dialog.tsx
index a6ba610458..a26e74e18c 100644
--- a/services/platform/app/features/documents/components/document-preview-dialog.tsx
+++ b/services/platform/app/features/documents/components/document-preview-dialog.tsx
@@ -112,6 +112,19 @@ function DetailsSidebar({ doc }: { doc: Document }) {
         />
       </SidebarRow>
 
+      {doc.scannedPagesDetected != null && doc.scannedPagesDetected > 0 && (
+        <SidebarRow label={t('preview.sidebar.imagePages')}>
+          {String(doc.scannedPagesDetected)}
+          {doc.ragStatus === 'completed' && doc.ocrApplied != null && (
+            <Text variant="label-sm" className="text-muted-foreground">
+              {doc.ocrApplied
+                ? t('ocr.processingWithOcr')
+                : t('ocr.unavailable')}
+            </Text>
+          )}
+        </SidebarRow>
+      )}
+
       <Separator />
 
       {teamNames.length > 0 && (
diff --git a/services/platform/app/features/documents/hooks/use-documents-table-config.tsx b/services/platform/app/features/documents/hooks/use-documents-table-config.tsx
index 723a6ab06e..4f54948fda 100644
--- a/services/platform/app/features/documents/hooks/use-documents-table-config.tsx
+++ b/services/platform/app/features/documents/hooks/use-documents-table-config.tsx
@@ -164,12 +164,24 @@ export function useDocumentsTableConfig({
               —
             </Text>
           ) : (
-            <RagStatusBadge
-              status={row.original.ragStatus}
-              indexedAt={row.original.ragIndexedAt}
-              error={row.original.ragError}
-              documentId={row.original.id}
-            />
+            <HStack gap={2} className="items-center">
+              <RagStatusBadge
+                status={row.original.ragStatus}
+                indexedAt={row.original.ragIndexedAt}
+                error={row.original.ragError}
+                documentId={row.original.id}
+              />
+              {row.original.ragStatus === 'completed' &&
+                row.original.ocrApplied === true && (
+                  <Text
+                    as="span"
+                    variant="label-sm"
+                    className="text-muted-foreground"
+                  >
+                    OCR
+                  </Text>
+                )}
+            </HStack>
           ),
       },
       {
diff --git a/services/platform/convex/agents/mutations.ts b/services/platform/convex/agents/mutations.ts
index 80365dd80e..d7cba9b65a 100644
--- a/services/platform/convex/agents/mutations.ts
+++ b/services/platform/convex/agents/mutations.ts
@@ -167,13 +167,16 @@ export const addKnowledgeFile = mutation({
       name: authUser.name,
     });
 
-    await ctx.db.insert('fileMetadata', {
-      organizationId: args.organizationId,
-      storageId: args.fileId,
-      fileName: args.fileName,
-      contentType: args.contentType,
-      size: args.fileSize,
-    });
+    await ctx.runMutation(
+      internal.file_metadata.internal_mutations.saveFileMetadata,
+      {
+        organizationId: args.organizationId,
+        storageId: args.fileId,
+        fileName: args.fileName,
+        contentType: args.contentType,
+        size: args.fileSize,
+      },
+    );
 
     const existing = await ctx.db
       .query('agentBindings')
diff --git a/services/platform/convex/conversations/send_message_via_integration.ts b/services/platform/convex/conversations/send_message_via_integration.ts
index 3729189c22..fc66cbd552 100644
--- a/services/platform/convex/conversations/send_message_via_integration.ts
+++ b/services/platform/convex/conversations/send_message_via_integration.ts
@@ -63,13 +63,16 @@ export async function sendMessageViaIntegration(
   if (args.attachments && args.attachments.length > 0) {
     attachmentsMeta = await Promise.all(
       args.attachments.map(async (att) => {
-        await ctx.db.insert('fileMetadata', {
-          organizationId: args.organizationId,
-          storageId: att.storageId,
-          fileName: att.fileName,
-          contentType: att.contentType,
-          size: att.size,
-        });
+        await ctx.runMutation(
+          internal.file_metadata.internal_mutations.saveFileMetadata,
+          {
+            organizationId: args.organizationId,
+            storageId: att.storageId,
+            fileName: att.fileName,
+            contentType: att.contentType,
+            size: att.size,
+          },
+        );
         const url = await ctx.storage.getUrl(att.storageId);
         return {
           id: att.storageId,
diff --git a/services/platform/convex/documents/__tests__/create_document_from_upload.test.ts b/services/platform/convex/documents/__tests__/create_document_from_upload.test.ts
index 87b1a7af16..8ff85c5dfc 100644
--- a/services/platform/convex/documents/__tests__/create_document_from_upload.test.ts
+++ b/services/platform/convex/documents/__tests__/create_document_from_upload.test.ts
@@ -28,9 +28,10 @@ vi.mock('../../_generated/server', async (importOriginal) => {
 
 vi.mock('../../_generated/api', () => ({
   internal: {
-    documents: {
-      internal_actions: {
-        extractDocumentDates: 'extractDocumentDates',
+    file_metadata: {
+      internal_mutations: {
+        saveFileMetadata: 'saveFileMetadata',
+        linkDocumentToFile: 'linkDocumentToFile',
       },
     },
   },
@@ -98,6 +99,7 @@ function createMockCtx() {
       insert: vi.fn().mockResolvedValue('fm_new'),
       patch: vi.fn().mockResolvedValue(undefined),
     },
+    runMutation: vi.fn().mockResolvedValue('fm_new'),
     scheduler: {
       runAfter: vi.fn().mockResolvedValue(undefined),
     },
@@ -136,14 +138,14 @@ describe('createDocumentFromUpload', () => {
     await expect(handler(ctx, baseArgs)).rejects.toThrow('Unauthenticated');
   });
 
-  it('inserts file metadata when fileSize is provided', async () => {
+  it('saves file metadata via runMutation when fileSize is provided', async () => {
     mockGetAuthUser.mockResolvedValue(AUTH_USER);
     const ctx = createMockCtx();
     const handler = await getHandler();
 
     await handler(ctx, baseArgs);
 
-    expect(ctx.db.insert).toHaveBeenCalledWith('fileMetadata', {
+    expect(ctx.runMutation).toHaveBeenCalledWith('saveFileMetadata', {
       organizationId: 'org_1',
       storageId: 'storage_1',
       fileName: 'report.pdf',
@@ -153,7 +155,7 @@ describe('createDocumentFromUpload', () => {
     });
   });
 
-  it('skips file metadata insert when fileSize is not provided', async () => {
+  it('skips file metadata save when fileSize is not provided', async () => {
     mockGetAuthUser.mockResolvedValue(AUTH_USER);
     const ctx = createMockCtx();
     const handler = await getHandler();
@@ -161,7 +163,10 @@ describe('createDocumentFromUpload', () => {
 
     await handler(ctx, argsWithoutSize);
 
-    expect(ctx.db.insert).not.toHaveBeenCalled();
+    expect(ctx.runMutation).not.toHaveBeenCalledWith(
+      'saveFileMetadata',
+      expect.anything(),
+    );
   });
 
   it('creates document and returns documentId', async () => {
@@ -227,19 +232,20 @@ describe('createDocumentFromUpload', () => {
     ).rejects.toThrow('Folder not accessible');
   });
 
-  it('patches fileMetadata with documentId after document creation', async () => {
+  it('links document to file after document creation', async () => {
     mockGetAuthUser.mockResolvedValue(AUTH_USER);
     const ctx = createMockCtx();
     const handler = await getHandler();
 
     await handler(ctx, baseArgs);
 
-    expect(ctx.db.patch).toHaveBeenCalledWith('fm_new', {
+    expect(ctx.runMutation).toHaveBeenCalledWith('linkDocumentToFile', {
+      storageId: 'storage_1',
       documentId: 'doc_created',
     });
   });
 
-  it('does not patch fileMetadata when fileSize is not provided', async () => {
+  it('does not link document to file when fileSize is not provided', async () => {
     mockGetAuthUser.mockResolvedValue(AUTH_USER);
     const ctx = createMockCtx();
     const handler = await getHandler();
@@ -247,36 +253,18 @@ describe('createDocumentFromUpload', () => {
 
     await handler(ctx, argsWithoutSize);
 
-    expect(ctx.db.patch).not.toHaveBeenCalled();
-  });
-
-  it('schedules extractDocumentDates for PDF uploads', async () => {
-    mockGetAuthUser.mockResolvedValue(AUTH_USER);
-    const ctx = createMockCtx();
-    const handler = await getHandler();
-
-    await handler(ctx, baseArgs);
-
-    expect(ctx.scheduler.runAfter).toHaveBeenCalledWith(
-      0,
-      'extractDocumentDates',
-      {
-        documentId: 'doc_created',
-        fileId: 'storage_1',
-      },
+    expect(ctx.runMutation).not.toHaveBeenCalledWith(
+      'linkDocumentToFile',
+      expect.anything(),
     );
   });
 
-  it('does not schedule extractDocumentDates for TXT uploads', async () => {
+  it('does not schedule extractDocumentDates (handled by saveFileMetadata)', async () => {
     mockGetAuthUser.mockResolvedValue(AUTH_USER);
     const ctx = createMockCtx();
     const handler = await getHandler();
 
-    await handler(ctx, {
-      ...baseArgs,
-      fileName: 'notes.txt',
-      contentType: 'text/plain',
-    });
+    await handler(ctx, baseArgs);
 
     expect(ctx.scheduler.runAfter).not.toHaveBeenCalled();
   });
diff --git a/services/platform/convex/documents/internal_actions.ts b/services/platform/convex/documents/internal_actions.ts
index 6399f84858..8c4d58ff58 100644
--- a/services/platform/convex/documents/internal_actions.ts
+++ b/services/platform/convex/documents/internal_actions.ts
@@ -2,21 +2,14 @@
 
 import { v } from 'convex/values';
 
-import { extractExtension } from '../../lib/shared/file-types';
 import { fetchJson } from '../../lib/utils/type-cast-helpers';
-import {
-  isRecord,
-  getBoolean,
-  getNumber,
-  getString,
-} from '../../lib/utils/type-guards';
+import { isRecord, getBoolean, getString } from '../../lib/utils/type-guards';
 import { internal } from '../_generated/api';
 import type { Id } from '../_generated/dataModel';
 import { internalAction } from '../_generated/server';
 import { buildDownloadUrl } from '../lib/helpers/public_storage_url';
 import { getRagConfig } from '../lib/helpers/rag_config';
 import { ragAction } from '../workflow_engine/action_defs/rag/rag_action';
-import { getCrawlerUrl } from './generate_document_helpers';
 import type { GenerateDocxResult } from './generate_docx';
 import * as DocumentsHelpers from './helpers';
 import type { GenerateDocumentResult } from './types';
@@ -290,16 +283,25 @@ export const checkRagDocumentStatus = internalAction({
             ? getString(docStatus, 'source_modified_at')
             : undefined,
         );
-
-        // Write dates BEFORE marking completed — if we crash after marking
-        // completed but before writing dates, the polling loop won't retry.
-        if (sourceCreatedAt != null || sourceModifiedAt != null) {
+        const ocrApplied = isRecord(docStatus)
+          ? getBoolean(docStatus, 'ocr_applied')
+          : undefined;
+
+        // Write dates and OCR status BEFORE marking completed — if we crash
+        // after marking completed but before writing, the polling loop won't retry.
+        const datePatch: Record<string, unknown> = {};
+        if (sourceCreatedAt != null)
+          datePatch.sourceCreatedAt = sourceCreatedAt;
+        if (sourceModifiedAt != null)
+          datePatch.sourceModifiedAt = sourceModifiedAt;
+        if (ocrApplied != null) datePatch.ocrApplied = ocrApplied;
+
+        if (Object.keys(datePatch).length > 0) {
           await ctx.runMutation(
             internal.documents.internal_mutations.updateDocumentDates,
             {
               documentId: args.documentId,
-              ...(sourceCreatedAt != null && { sourceCreatedAt }),
-              ...(sourceModifiedAt != null && { sourceModifiedAt }),
+              ...datePatch,
             },
           );
         }
@@ -698,135 +700,3 @@ export const storeRawContent = internalAction({
     };
   },
 });
-
-const EXTRACT_DATES_SUPPORTED_EXTENSIONS = new Set(['pdf', 'docx', 'pptx']);
-const EXTRACT_DATES_RETRY_DELAYS = [30_000, 60_000, 120_000];
-
-export const extractDocumentDates = internalAction({
-  args: {
-    documentId: v.id('documents'),
-    fileId: v.id('_storage'),
-    attempt: v.optional(v.number()),
-  },
-  returns: v.null(),
-  handler: async (ctx, args): Promise<null> => {
-    const attempt = args.attempt ?? 0;
-
-    try {
-      const document = await ctx.runQuery(
-        internal.documents.internal_queries.getDocumentByIdRaw,
-        { documentId: args.documentId },
-      );
-
-      if (!document) {
-        console.warn(
-          `[extractDocumentDates] Document ${args.documentId} not found, skipping`,
-        );
-        return null;
-      }
-
-      if (document.fileId !== args.fileId) {
-        console.warn(
-          `[extractDocumentDates] File changed for document ${args.documentId}, skipping stale extraction`,
-        );
-        return null;
-      }
-
-      const ext = document.extension ?? extractExtension(document.title);
-      if (!ext || !EXTRACT_DATES_SUPPORTED_EXTENSIONS.has(ext)) {
-        console.warn(
-          `[extractDocumentDates] Unsupported extension "${ext}" for document ${args.documentId}`,
-        );
-        return null;
-      }
-
-      const fileUrl = await ctx.storage.getUrl(args.fileId);
-      if (!fileUrl) {
-        console.warn(
-          `[extractDocumentDates] No URL for file ${args.fileId}, skipping`,
-        );
-        return null;
-      }
-
-      const fileResponse = await fetch(fileUrl, {
-        signal: AbortSignal.timeout(30_000),
-      });
-
-      if (!fileResponse.ok) {
-        throw new Error(
-          `Failed to download file: ${fileResponse.status} ${fileResponse.statusText}`,
-        );
-      }
-
-      const fileBlob = await fileResponse.blob();
-
-      const crawlerUrl = getCrawlerUrl();
-      const endpoint = `${crawlerUrl}/api/v1/${ext}/extract-metadata`;
-
-      const formData = new FormData();
-      formData.append('file', fileBlob, document.title ?? `file.${ext}`);
-
-      const metadataResponse = await fetch(endpoint, {
-        method: 'POST',
-        body: formData,
-        signal: AbortSignal.timeout(30_000),
-      });
-
-      if (!metadataResponse.ok) {
-        const errorText = await metadataResponse.text().catch(() => '');
-        throw new Error(
-          `Crawler extract-metadata returned ${metadataResponse.status}: ${errorText}`,
-        );
-      }
-
-      let body: unknown;
-      try {
-        body = await metadataResponse.json();
-      } catch {
-        throw new Error('Crawler returned non-JSON response');
-      }
-
-      if (!isRecord(body)) {
-        throw new Error('Invalid response shape from crawler extract-metadata');
-      }
-
-      const createdAt = getNumber(body, 'created_at');
-      const modifiedAt = getNumber(body, 'modified_at');
-
-      if (createdAt != null || modifiedAt != null) {
-        await ctx.runMutation(
-          internal.documents.internal_mutations.updateDocumentDates,
-          {
-            documentId: args.documentId,
-            sourceCreatedAt: createdAt,
-            sourceModifiedAt: modifiedAt,
-          },
-        );
-      }
-    } catch (error) {
-      const message = error instanceof Error ? error.message : String(error);
-      console.error(
-        `[extractDocumentDates] Error for document ${args.documentId} (attempt ${attempt}): ${message}`,
-      );
-
-      if (attempt < EXTRACT_DATES_RETRY_DELAYS.length) {
-        const retryDelay = EXTRACT_DATES_RETRY_DELAYS[attempt];
-        await ctx.scheduler.runAfter(
-          retryDelay,
-          internal.documents.internal_actions.extractDocumentDates,
-          {
-            documentId: args.documentId,
-            fileId: args.fileId,
-            attempt: attempt + 1,
-          },
-        );
-      } else {
-        console.warn(
-          `[extractDocumentDates] All retries exhausted for document ${args.documentId}: ${message}`,
-        );
-      }
-    }
-
-    return null;
-  },
-});
diff --git a/services/platform/convex/documents/internal_mutations.ts b/services/platform/convex/documents/internal_mutations.ts
index 1914b0cfb9..c4afed0e89 100644
--- a/services/platform/convex/documents/internal_mutations.ts
+++ b/services/platform/convex/documents/internal_mutations.ts
@@ -66,6 +66,8 @@ export const updateDocumentDates = internalMutation({
     documentId: v.id('documents'),
     sourceCreatedAt: v.optional(v.number()),
     sourceModifiedAt: v.optional(v.number()),
+    scannedPagesDetected: v.optional(v.number()),
+    ocrApplied: v.optional(v.boolean()),
   },
   returns: v.null(),
   handler: async (ctx, args) => {
@@ -74,13 +76,19 @@ export const updateDocumentDates = internalMutation({
       return null;
     }
 
-    const patch: Record<string, number> = {};
+    const patch: Record<string, unknown> = {};
     if (args.sourceCreatedAt != null) {
       patch.sourceCreatedAt = args.sourceCreatedAt;
     }
     if (args.sourceModifiedAt != null) {
       patch.sourceModifiedAt = args.sourceModifiedAt;
     }
+    if (args.scannedPagesDetected != null) {
+      patch.scannedPagesDetected = args.scannedPagesDetected;
+    }
+    if (args.ocrApplied != null) {
+      patch.ocrApplied = args.ocrApplied;
+    }
 
     if (Object.keys(patch).length > 0) {
       await ctx.db.patch(args.documentId, patch);
diff --git a/services/platform/convex/documents/mutations.ts b/services/platform/convex/documents/mutations.ts
index ea1ec9c2ef..72bc7d3025 100644
--- a/services/platform/convex/documents/mutations.ts
+++ b/services/platform/convex/documents/mutations.ts
@@ -165,14 +165,17 @@ export const createDocumentFromUpload = mutation({
 
     let fileMetadataId;
     if (args.fileSize != null) {
-      fileMetadataId = await ctx.db.insert('fileMetadata', {
-        organizationId: args.organizationId,
-        storageId: args.fileId,
-        fileName: args.fileName,
-        contentType: args.contentType ?? 'application/octet-stream',
-        size: args.fileSize,
-        uploadedBy: userId,
-      });
+      fileMetadataId = await ctx.runMutation(
+        internal.file_metadata.internal_mutations.saveFileMetadata,
+        {
+          organizationId: args.organizationId,
+          storageId: args.fileId,
+          fileName: args.fileName,
+          contentType: args.contentType ?? 'application/octet-stream',
+          size: args.fileSize,
+          uploadedBy: userId,
+        },
+      );
     }
 
     const result = await createDocument(ctx, {
@@ -189,18 +192,11 @@ export const createDocumentFromUpload = mutation({
     });
 
     if (fileMetadataId) {
-      await ctx.db.patch(fileMetadataId, {
-        documentId: result.documentId,
-      });
-    }
-
-    if (ext && ['pdf', 'docx', 'pptx'].includes(ext)) {
-      await ctx.scheduler.runAfter(
-        0,
-        internal.documents.internal_actions.extractDocumentDates,
+      await ctx.runMutation(
+        internal.file_metadata.internal_mutations.linkDocumentToFile,
         {
+          storageId: args.fileId,
           documentId: result.documentId,
-          fileId: args.fileId,
         },
       );
     }
diff --git a/services/platform/convex/documents/schema.ts b/services/platform/convex/documents/schema.ts
index 6af22018b7..dcb3826931 100644
--- a/services/platform/convex/documents/schema.ts
+++ b/services/platform/convex/documents/schema.ts
@@ -45,6 +45,8 @@ export const documentsTable = defineTable({
     }),
   ),
   indexed: v.optional(v.boolean()),
+  scannedPagesDetected: v.optional(v.number()),
+  ocrApplied: v.optional(v.boolean()),
   sourceCreatedAt: v.optional(v.number()),
   sourceModifiedAt: v.optional(v.number()),
   createdBy: v.optional(v.string()),
diff --git a/services/platform/convex/documents/transform_to_document_item.ts b/services/platform/convex/documents/transform_to_document_item.ts
index 2e8c7746df..285302f4fa 100644
--- a/services/platform/convex/documents/transform_to_document_item.ts
+++ b/services/platform/convex/documents/transform_to_document_item.ts
@@ -107,6 +107,8 @@ export function transformToDocumentItem(
     ragStatus: document.ragInfo?.status,
     ragIndexedAt: document.ragInfo?.indexedAt,
     ragError: document.ragInfo?.error,
+    scannedPagesDetected: document.scannedPagesDetected,
+    ocrApplied: document.ocrApplied,
     teamId: document.teamId ?? null,
     teamIds: document.teamTags ?? [],
     // Creator tracking
diff --git a/services/platform/convex/documents/validators.ts b/services/platform/convex/documents/validators.ts
index 4e00158b41..61f61e334f 100644
--- a/services/platform/convex/documents/validators.ts
+++ b/services/platform/convex/documents/validators.ts
@@ -69,6 +69,8 @@ export const documentItemValidator = v.object({
   ragStatus: v.optional(ragStatusValidator),
   ragIndexedAt: v.optional(v.number()),
   ragError: v.optional(v.string()),
+  scannedPagesDetected: v.optional(v.number()),
+  ocrApplied: v.optional(v.boolean()),
   teamId: v.optional(v.union(v.string(), v.null())),
   teamIds: v.optional(v.array(v.string())),
   createdBy: v.optional(v.string()),
diff --git a/services/platform/convex/file_metadata/actions.ts b/services/platform/convex/file_metadata/actions.ts
index a12c17f662..a39327c7a2 100644
--- a/services/platform/convex/file_metadata/actions.ts
+++ b/services/platform/convex/file_metadata/actions.ts
@@ -2,7 +2,7 @@
 
 import { v } from 'convex/values';
 
-import { isRecord, getString } from '../../lib/utils/type-guards';
+import { isRecord, getBoolean, getString } from '../../lib/utils/type-guards';
 import { internal } from '../_generated/api';
 import { action } from '../_generated/server';
 import { getRagConfig } from '../lib/helpers/rag_config';
@@ -68,9 +68,14 @@ export const checkFileRagStatuses = action({
           : progressPhase || undefined;
 
       if (status === 'completed') {
+        const ocrApplied = getBoolean(docStatus, 'ocr_applied');
         await ctx.runMutation(
           internal.file_metadata.internal_mutations.updateFileRagStatus,
-          { storageId, ragStatus: 'completed' },
+          {
+            storageId,
+            ragStatus: 'completed',
+            ...(ocrApplied != null && { ocrApplied }),
+          },
         );
       } else if (status === 'failed') {
         await ctx.runMutation(
diff --git a/services/platform/convex/file_metadata/internal_actions.ts b/services/platform/convex/file_metadata/internal_actions.ts
index 38366ef723..9e0ea91d62 100644
--- a/services/platform/convex/file_metadata/internal_actions.ts
+++ b/services/platform/convex/file_metadata/internal_actions.ts
@@ -2,8 +2,11 @@
 
 import { v } from 'convex/values';
 
+import { extractExtension } from '../../lib/shared/file-types';
+import { isRecord, getNumber } from '../../lib/utils/type-guards';
 import { internal } from '../_generated/api';
 import { internalAction } from '../_generated/server';
+import { getCrawlerUrl } from '../documents/generate_document_helpers';
 import { getRagConfig } from '../lib/helpers/rag_config';
 import { ragAction } from '../workflow_engine/action_defs/rag/rag_action';
 
@@ -54,3 +57,172 @@ export const uploadFileToRag = internalAction({
     return null;
   },
 });
+
+const EXTRACT_METADATA_EXTENSIONS = new Set(['pdf', 'docx', 'pptx']);
+const IMAGE_CONTENT_TYPES = new Set([
+  'image/jpeg',
+  'image/png',
+  'image/gif',
+  'image/webp',
+]);
+const EXTRACT_METADATA_RETRY_DELAYS = [30_000, 60_000, 120_000];
+
+/**
+ * Extract vision/OCR metadata and document dates for an uploaded file.
+ *
+ * Triggered by saveFileMetadata on new inserts. For PDF/DOCX/PPTX, calls
+ * the crawler extract-metadata endpoint. For images, sets defaults directly.
+ * For other file types (CSV, TXT, XLSX), sets visionRequired=false.
+ */
+export const extractFileMetadata = internalAction({
+  args: {
+    storageId: v.id('_storage'),
+    fileName: v.string(),
+    contentType: v.string(),
+    attempt: v.optional(v.number()),
+  },
+  returns: v.null(),
+  handler: async (ctx, args): Promise<null> => {
+    const attempt = args.attempt ?? 0;
+    const ext = extractExtension(args.fileName);
+
+    // Images: always need vision, no crawler call needed
+    if (IMAGE_CONTENT_TYPES.has(args.contentType)) {
+      await ctx.runMutation(
+        internal.file_metadata.internal_mutations.updateFileVisionMetadata,
+        {
+          storageId: args.storageId,
+          pageCount: 1,
+          scannedPagesDetected: 0,
+          visionRequired: true,
+        },
+      );
+      return null;
+    }
+
+    // PDF/DOCX/PPTX: call crawler extract-metadata
+    if (ext && EXTRACT_METADATA_EXTENSIONS.has(ext)) {
+      try {
+        const fileUrl = await ctx.storage.getUrl(args.storageId);
+        if (!fileUrl) {
+          console.warn(
+            `[extractFileMetadata] No URL for file ${args.storageId}, skipping`,
+          );
+          return null;
+        }
+
+        const fileResponse = await fetch(fileUrl, {
+          signal: AbortSignal.timeout(30_000),
+        });
+        if (!fileResponse.ok) {
+          throw new Error(
+            `Failed to download file: ${fileResponse.status} ${fileResponse.statusText}`,
+          );
+        }
+
+        const fileBlob = await fileResponse.blob();
+        const crawlerUrl = getCrawlerUrl();
+        const endpoint = `${crawlerUrl}/api/v1/${ext}/extract-metadata`;
+
+        const formData = new FormData();
+        formData.append('file', fileBlob, args.fileName);
+
+        const metadataResponse = await fetch(endpoint, {
+          method: 'POST',
+          body: formData,
+          signal: AbortSignal.timeout(30_000),
+        });
+
+        if (!metadataResponse.ok) {
+          const errorText = await metadataResponse.text().catch(() => '');
+          throw new Error(
+            `Crawler extract-metadata returned ${metadataResponse.status}: ${errorText}`,
+          );
+        }
+
+        let body: unknown;
+        try {
+          body = await metadataResponse.json();
+        } catch {
+          throw new Error('Crawler returned non-JSON response');
+        }
+
+        if (!isRecord(body)) {
+          throw new Error(
+            'Invalid response shape from crawler extract-metadata',
+          );
+        }
+
+        const pageCount = getNumber(body, 'page_count');
+        const scannedPagesDetected = getNumber(body, 'scanned_pages_detected');
+        const createdAt = getNumber(body, 'created_at');
+        const modifiedAt = getNumber(body, 'modified_at');
+
+        // Write vision metadata to fileMetadata
+        await ctx.runMutation(
+          internal.file_metadata.internal_mutations.updateFileVisionMetadata,
+          {
+            storageId: args.storageId,
+            pageCount: pageCount ?? undefined,
+            scannedPagesDetected: scannedPagesDetected ?? undefined,
+            visionRequired:
+              scannedPagesDetected != null ? scannedPagesDetected > 0 : false,
+          },
+        );
+
+        // Write dates and scanned page info to linked document (if any)
+        const fileMetadata = await ctx.runQuery(
+          internal.file_metadata.internal_queries.getByStorageId,
+          { storageId: args.storageId },
+        );
+
+        if (fileMetadata?.documentId) {
+          await ctx.runMutation(
+            internal.documents.internal_mutations.updateDocumentDates,
+            {
+              documentId: fileMetadata.documentId,
+              sourceCreatedAt: createdAt,
+              sourceModifiedAt: modifiedAt,
+              scannedPagesDetected: scannedPagesDetected ?? undefined,
+            },
+          );
+        }
+      } catch (error) {
+        const message = error instanceof Error ? error.message : String(error);
+        console.error(
+          `[extractFileMetadata] Error for file ${args.storageId} (attempt ${attempt}): ${message}`,
+        );
+
+        if (attempt < EXTRACT_METADATA_RETRY_DELAYS.length) {
+          const retryDelay = EXTRACT_METADATA_RETRY_DELAYS[attempt];
+          await ctx.scheduler.runAfter(
+            retryDelay,
+            internal.file_metadata.internal_actions.extractFileMetadata,
+            {
+              storageId: args.storageId,
+              fileName: args.fileName,
+              contentType: args.contentType,
+              attempt: attempt + 1,
+            },
+          );
+        } else {
+          console.warn(
+            `[extractFileMetadata] All retries exhausted for file ${args.storageId}: ${message}`,
+          );
+        }
+      }
+      return null;
+    }
+
+    // All other file types: no vision needed
+    await ctx.runMutation(
+      internal.file_metadata.internal_mutations.updateFileVisionMetadata,
+      {
+        storageId: args.storageId,
+        scannedPagesDetected: 0,
+        visionRequired: false,
+      },
+    );
+    return null;
+  },
+});
diff --git a/services/platform/convex/file_metadata/internal_mutations.ts b/services/platform/convex/file_metadata/internal_mutations.ts
index 60cad084d3..f3bd06c2e6 100644
--- a/services/platform/convex/file_metadata/internal_mutations.ts
+++ b/services/platform/convex/file_metadata/internal_mutations.ts
@@ -65,6 +65,16 @@ export const saveFileMetadata = internalMutation({
       },
     );
 
+    await ctx.scheduler.runAfter(
+      0,
+      internal.file_metadata.internal_actions.extractFileMetadata,
+      {
+        storageId: args.storageId,
+        fileName: args.fileName,
+        contentType: args.contentType,
+      },
+    );
+
     try {
       await checkOrganizationRateLimit(
         ctx,
@@ -97,6 +107,7 @@ export const updateFileRagStatus = internalMutation({
     ),
     ragError: v.optional(v.string()),
     ragProgress: v.optional(v.string()),
+    ocrApplied: v.optional(v.boolean()),
   },
   async handler(ctx, args) {
     const metadata = await ctx.db
@@ -112,7 +123,44 @@ export const updateFileRagStatus = internalMutation({
         args.ragStatus === 'completed' || args.ragStatus === 'failed'
           ? undefined
           : args.ragProgress,
+      ...(args.ocrApplied != null && { ocrApplied: args.ocrApplied }),
     });
+
+    // Sync ocrApplied to linked document so the list view can show it
+    if (args.ocrApplied != null && metadata.documentId) {
+      const doc = await ctx.db.get(metadata.documentId);
+      if (doc) {
+        await ctx.db.patch(metadata.documentId, {
+          ocrApplied: args.ocrApplied,
+        });
+      }
+    }
+  },
+});
+
+export const updateFileVisionMetadata = internalMutation({
+  args: {
+    storageId: v.id('_storage'),
+    pageCount: v.optional(v.number()),
+    scannedPagesDetected: v.optional(v.number()),
+    visionRequired: v.optional(v.boolean()),
+  },
+  async handler(ctx, args) {
+    const metadata = await ctx.db
+      .query('fileMetadata')
+      .withIndex('by_storageId', (q) => q.eq('storageId', args.storageId))
+      .first();
+    if (!metadata) return;
+
+    const patch: Record<string, unknown> = {};
+    if (args.pageCount != null) patch.pageCount = args.pageCount;
+    if (args.scannedPagesDetected != null)
+      patch.scannedPagesDetected = args.scannedPagesDetected;
+    if (args.visionRequired != null) patch.visionRequired = args.visionRequired;
+
+    if (Object.keys(patch).length > 0) {
+      await ctx.db.patch(metadata._id, patch);
+    }
   },
 });
 
diff --git a/services/platform/convex/file_metadata/mutations.ts b/services/platform/convex/file_metadata/mutations.ts
index 8698e5afdc..4f25748586 100644
--- a/services/platform/convex/file_metadata/mutations.ts
+++ b/services/platform/convex/file_metadata/mutations.ts
@@ -84,6 +84,16 @@ export const saveFileMetadata = mutation({
       },
     );
 
+    await ctx.scheduler.runAfter(
+      0,
+      internal.file_metadata.internal_actions.extractFileMetadata,
+      {
+        storageId: args.storageId,
+        fileName: args.fileName,
+        contentType: args.contentType,
+      },
+    );
+
     try {
       await checkOrganizationRateLimit(
         ctx,
diff --git a/services/platform/convex/file_metadata/queries.ts b/services/platform/convex/file_metadata/queries.ts
index 3033b2d502..db0a927c2c 100644
--- a/services/platform/convex/file_metadata/queries.ts
+++ b/services/platform/convex/file_metadata/queries.ts
@@ -27,6 +27,43 @@ export const getUserStorageUsage = query({
   },
 });
 
+export const getByDocumentId = query({
+  args: {
+    organizationId: v.string(),
+    documentId: v.id('documents'),
+  },
+  returns: v.union(
+    v.object({
+      pageCount: v.optional(v.number()),
+      scannedPagesDetected: v.optional(v.number()),
+      visionRequired: v.optional(v.boolean()),
+      ocrApplied: v.optional(v.boolean()),
+    }),
+    v.null(),
+  ),
+  handler: async (ctx, args) => {
+    const authUser = await getAuthUserIdentity(ctx);
+    if (!authUser) return null;
+
+    const meta = await ctx.db
+      .query('fileMetadata')
+      .withIndex('by_organizationId_and_documentId', (q) =>
+        q
+          .eq('organizationId', args.organizationId)
+          .eq('documentId', args.documentId),
+      )
+      .first();
+    if (!meta) return null;
+
+    return {
+      pageCount: meta.pageCount,
+      scannedPagesDetected: meta.scannedPagesDetected,
+      visionRequired: meta.visionRequired,
+      ocrApplied: meta.ocrApplied,
+    };
+  },
+});
+
 export const getByStorageIds = query({
   args: {
     storageIds: v.array(v.id('_storage')),
@@ -48,6 +85,9 @@ export const getByStorageIds = query({
       ),
       ragError: v.optional(v.string()),
       ragProgress: v.optional(v.string()),
+      pageCount: v.optional(v.number()),
+      scannedPagesDetected: v.optional(v.number()),
+      visionRequired: v.optional(v.boolean()),
     }),
   ),
   handler: async (ctx, args) => {
@@ -70,6 +110,9 @@ export const getByStorageIds = query({
           ragStatus: meta.ragStatus,
           ragError: meta.ragError,
           ragProgress: meta.ragProgress,
+          pageCount: meta.pageCount,
+          scannedPagesDetected: meta.scannedPagesDetected,
+          visionRequired: meta.visionRequired,
         };
       }),
     );
diff --git a/services/platform/convex/file_metadata/schema.ts b/services/platform/convex/file_metadata/schema.ts
index b87dfbdb77..e72d244b8d 100644
--- a/services/platform/convex/file_metadata/schema.ts
+++ b/services/platform/convex/file_metadata/schema.ts
@@ -9,6 +9,10 @@ export const fileMetadataTable = defineTable({
   fileName: v.string(),
   contentType: v.string(),
   size: v.number(),
+  pageCount: v.optional(v.number()),
+  scannedPagesDetected: v.optional(v.number()),
+  visionRequired: v.optional(v.boolean()),
+  ocrApplied: v.optional(v.boolean()),
   ragStatus: v.optional(
     v.union(
       v.literal('queued'),
diff --git a/services/platform/messages/de.json b/services/platform/messages/de.json
index 08c26f5a50..f9fc4d6bcf 100644
--- a/services/platform/messages/de.json
+++ b/services/platform/messages/de.json
@@ -2972,7 +2972,8 @@
         "modified": "Geändert",
         "sourceUpload": "Upload",
         "sourceOnedrive": "OneDrive",
-        "sourceSharepoint": "SharePoint"
+        "sourceSharepoint": "SharePoint",
+        "imagePages": "Bildseiten"
       }
     },
     "upload": {
@@ -3013,6 +3014,12 @@
       "extensionNotAllowed": "{name}: .{ext} ist nicht erlaubt. Erlaubte Typen: {allowed}.",
       "uploadDocuments": "Hochladen"
     },
+    "ocr": {
+      "available": "OCR verfügbar",
+      "unavailable": "OCR nicht verfügbar",
+      "processingWithOcr": "Dokument mit OCR verarbeitet",
+      "noVisionModelWarning": "Kein Vision-Modell konfiguriert — gescannte Dokumente und Bilder werden nicht verarbeitet"
+    },
     "import": {
       "noResponseBody": "Kein Antwortinhalt erhalten"
     },
diff --git a/services/platform/messages/en.json b/services/platform/messages/en.json
index f970e9d525..8fb5055825 100644
--- a/services/platform/messages/en.json
+++ b/services/platform/messages/en.json
@@ -2997,7 +2997,8 @@
         "modified": "Modified",
         "sourceUpload": "Upload",
         "sourceOnedrive": "OneDrive",
-        "sourceSharepoint": "SharePoint"
+        "sourceSharepoint": "SharePoint",
+        "imagePages": "Image pages"
       }
     },
     "upload": {
@@ -3038,6 +3039,12 @@
       "extensionNotAllowed": "{name}: .{ext} is not allowed. Allowed types: {allowed}.",
       "uploadDocuments": "Upload"
     },
+    "ocr": {
+      "available": "OCR available",
+      "unavailable": "OCR unavailable",
+      "processingWithOcr": "Document processed with OCR",
+      "noVisionModelWarning": "No vision model configured — scanned documents and images will not be processed"
+    },
     "import": {
       "noResponseBody": "No response body received"
     },
diff --git a/services/platform/types/documents.ts b/services/platform/types/documents.ts
index cfd3485a0b..348b2cfbfa 100644
--- a/services/platform/types/documents.ts
+++ b/services/platform/types/documents.ts
@@ -26,6 +26,10 @@ export interface DocumentItem {
   ragIndexedAt?: number;
   /** Error message (for failed status) */
   ragError?: string;
+  /** Number of scanned pages detected in the document */
+  scannedPagesDetected?: number;
+  /** Whether OCR was applied during RAG indexing */
+  ocrApplied?: boolean;
   teamId?: string | null;
   teamIds?: string[];
   /** User ID who created/uploaded this document */
diff --git a/services/rag/app/models.py b/services/rag/app/models.py
index 8f504e09b8..a49e6c414c 100644
--- a/services/rag/app/models.py
+++ b/services/rag/app/models.py
@@ -144,6 +144,7 @@ class DocumentStatusInfo(BaseModel):
     source_modified_at: dt.datetime | None = Field(
         default=None, description="Original file modification date (from file metadata)"
     )
+    ocr_applied: bool | None = Field(default=None, description="Whether OCR was applied during text extraction")
 
 
 class DocumentStatusRequest(BaseModel):
diff --git a/services/rag/app/routers/documents.py b/services/rag/app/routers/documents.py
index 0c40d54eb5..30fb5dea77 100644
--- a/services/rag/app/routers/documents.py
+++ b/services/rag/app/routers/documents.py
@@ -559,6 +559,7 @@ async def get_document_statuses(request: DocumentStatusRequest):
                 progress_detail=info.get("progress_detail"),
                 source_created_at=info.get("source_created_at"),
                 source_modified_at=info.get("source_modified_at"),
+                ocr_applied=info.get("ocr_applied"),
             )
             if info
             else None
diff --git a/services/rag/app/services/indexing_service.py b/services/rag/app/services/indexing_service.py
index 2c8d9d063b..cf8376033c 100644
--- a/services/rag/app/services/indexing_service.py
+++ b/services/rag/app/services/indexing_service.py
@@ -429,8 +429,8 @@ async def _do_store(
             f"""
                 INSERT INTO {SCHEMA}.documents
                     (file_id, filename, content_hash, status, chunks_count,
-                     source_created_at, source_modified_at)
-                VALUES ($1, $2, $3, 'completed', $4, $5, $6)
+                     source_created_at, source_modified_at, ocr_applied)
+                VALUES ($1, $2, $3, 'completed', $4, $5, $6, $7)
                 RETURNING id
                 """,
             file_id,
@@ -439,6 +439,7 @@ async def _do_store(
             len(prepared.chunks),
             prepared.source_created_at,
             prepared.source_modified_at,
+            prepared.vision_used,
         )
         doc_uuid = doc_row["id"]
 
@@ -543,6 +544,43 @@ async def index_document(
     content, clone its chunks instead of re-extracting/embedding.
     """
     content_hash = compute_content_hash(content_bytes)
+
+    # Fast path: same file_id with unchanged content AND chunks already stored —
+    # skip immediately instead of re-extracting/embedding.
+    async with acquire_with_retry(pool) as conn:
+        own_row = await conn.fetchrow(
+            f"""SELECT d.content_hash,
+                       (SELECT COUNT(*)
+                        FROM {SCHEMA}.chunks c
+                        WHERE c.document_id = d.id) AS chunk_count
+                FROM {SCHEMA}.documents d
+                WHERE d.file_id = $1""",
+            file_id,
+        )
+    if own_row and own_row["content_hash"] == content_hash and own_row["chunk_count"] > 0:
+        logger.info(
+            "Document {} content unchanged with {} chunks, skipping (early dedup)",
+            file_id,
+            own_row["chunk_count"],
+        )
+        async with acquire_with_retry(pool) as conn:
+            await conn.execute(
+                f"""UPDATE {SCHEMA}.documents
+                    SET status = 'completed', error = NULL,
+                        progress_phase = NULL,
+                        progress_detail = NULL,
+                        updated_at = NOW()
+                    WHERE file_id = $1""",
+                file_id,
+            )
+        return {
+            "success": True,
+            "file_id": file_id,
+            "chunks_created": 0,
+            "skipped": True,
+            "skip_reason": "content_unchanged",
+        }
+
     source_id = await find_existing_by_hash(pool, content_hash)
 
     if source_id is not None:
diff --git a/services/rag/app/services/rag_service.py b/services/rag/app/services/rag_service.py
index 4101d224bd..4f6cb260cf 100644
--- a/services/rag/app/services/rag_service.py
+++ b/services/rag/app/services/rag_service.py
@@ -526,7 +526,7 @@ async def get_document_statuses(
                 f"""
                 SELECT DISTINCT ON (file_id)
                     file_id, status, error, progress_phase, progress_detail,
-                    source_created_at, source_modified_at
+                    source_created_at, source_modified_at, ocr_applied
                 FROM {SCHEMA}.documents
                 WHERE file_id = ANY($1)
                 ORDER BY file_id,
@@ -549,6 +549,7 @@ async def get_document_statuses(
                 "progress_detail": row["progress_detail"],
                 "source_created_at": row["source_created_at"],
                 "source_modified_at": row["source_modified_at"],
+                "ocr_applied": row["ocr_applied"],
             }
             for row in rows
         }
diff --git a/services/rag/tests/test_background_ingest.py b/services/rag/tests/test_background_ingest.py
index b4f00f8102..819fb81d7c 100644
--- a/services/rag/tests/test_background_ingest.py
+++ b/services/rag/tests/test_background_ingest.py
@@ -74,6 +74,7 @@ async def test_returns_status_for_found_documents(self):
                     "progress_detail": None,
                     "source_created_at": None,
                     "source_modified_at": None,
+                    "ocr_applied": False,
                 },
                 {
                     "file_id": "doc-2",
@@ -83,6 +84,7 @@ async def test_returns_status_for_found_documents(self):
                     "progress_detail": None,
                     "source_created_at": None,
                     "source_modified_at": None,
+                    "ocr_applied": False,
                 },
             ]
         )
@@ -111,6 +113,7 @@ async def test_returns_error_field_for_failed_documents(self):
                     "progress_detail": None,
                     "source_created_at": None,
                     "source_modified_at": None,
+                    "ocr_applied": False,
                 },
             ]
         )
diff --git a/services/rag/tests/test_file_dates.py b/services/rag/tests/test_file_dates.py
index 01d3e4beea..da0a52cfb8 100644
--- a/services/rag/tests/test_file_dates.py
+++ b/services/rag/tests/test_file_dates.py
@@ -276,7 +276,7 @@ async def test_caller_dates_override_file_dates(self):
         caller_created = dt.datetime(2022, 6, 1, tzinfo=dt.UTC)
 
         mock_conn = AsyncMock()
-        mock_conn.fetchrow = AsyncMock(side_effect=[None, {"id": "uuid-1"}])
+        mock_conn.fetchrow = AsyncMock(side_effect=[None, None, {"id": "uuid-1"}])
         mock_conn.executemany = AsyncMock()
         mock_tx = AsyncMock()
         mock_tx.__aenter__ = AsyncMock(return_value=mock_tx)
@@ -317,10 +317,11 @@ async def test_caller_dates_override_file_dates(self):
 
         assert result["success"] is True
 
-        # The INSERT query has 6 positional args:
+        # The INSERT query has 7 positional args:
         # $1=file_id, $2=filename, $3=content_hash, $4=chunks_count,
-        # $5=source_created_at, $6=source_modified_at
-        insert_call = mock_conn.fetchrow.call_args_list[1]
+        # $5=source_created_at, $6=source_modified_at, $7=ocr_applied
+        # call_args_list[0] = early-dedup check, [1] = cross-scope dedup, [2] = INSERT
+        insert_call = mock_conn.fetchrow.call_args_list[2]
         args = insert_call[0]
         # args[0] is the SQL string, positional params start at args[1]
         source_created_arg = args[5]  # $5
diff --git a/services/rag/tests/test_indexing_service.py b/services/rag/tests/test_indexing_service.py
index 08fec12ca8..a395415f8b 100644
--- a/services/rag/tests/test_indexing_service.py
+++ b/services/rag/tests/test_indexing_service.py
@@ -52,6 +52,7 @@ def _mock_pool(
     mock_conn = AsyncMock()
     mock_conn.fetchrow = AsyncMock(
         side_effect=[
+            None,  # early-dedup check (no existing row with same content)
             existing_row,
             {"id": inserted_doc_id},
         ]
@@ -248,20 +249,17 @@ class TestContentHashDedup:
     async def test_skips_unchanged_content(self):
         from app.services.indexing_service import index_document
 
-        existing = {"id": "existing-uuid", "content_hash": SAMPLE_HASH}
-        pool, mock_conn = _mock_pool(existing_row=existing)
+        pool, mock_conn = _mock_pool()
+        # Early-dedup fetchrow returns a row with matching hash and existing chunks
+        mock_conn.fetchrow = AsyncMock(
+            return_value={"content_hash": SAMPLE_HASH, "chunk_count": 5},
+        )
         mock_embed = AsyncMock()
         mock_embed.embed_texts = AsyncMock(return_value=SAMPLE_EMBEDDINGS)
 
         with (
             _patch_acquire(mock_conn),
             patch("app.services.indexing_service.compute_content_hash", return_value=SAMPLE_HASH),
-            patch(
-                "app.services.indexing_service.extract_text",
-                new_callable=AsyncMock,
-                return_value=("Some text", False),
-            ),
-            patch("app.services.indexing_service.chunk_content", return_value=SAMPLE_CHUNKS),
         ):
             result = await index_document(
                 pool,
@@ -284,12 +282,12 @@ async def test_reindexes_when_content_changed(self):
         mock_embed.embed_texts = AsyncMock(return_value=SAMPLE_EMBEDDINGS)
 
         # The connection is used multiple times:
-        # 1. fetchrow for dedup check -> returns existing row
-        # 2. execute for DELETE old doc
+        # 1. fetchrow for early-dedup check -> returns row with different hash
+        # 2. fetchrow for cross-scope dedup -> returns existing row
         # 3. fetchrow for INSERT new doc RETURNING id
-        # 4. execute for each chunk INSERT
         mock_conn.fetchrow = AsyncMock(
             side_effect=[
+                {"content_hash": DIFFERENT_HASH, "chunk_count": 3},
                 existing,
                 {"id": "new-uuid"},
             ]
@@ -334,6 +332,7 @@ async def test_replacement_deletes_chunks_before_document(self):
 
         mock_conn.fetchrow = AsyncMock(
             side_effect=[
+                {"content_hash": DIFFERENT_HASH, "chunk_count": 3},
                 existing,
                 {"id": "new-uuid"},
             ]
@@ -381,6 +380,7 @@ async def test_replacement_uses_transaction(self):
 
         mock_conn.fetchrow = AsyncMock(
             side_effect=[
+                {"content_hash": DIFFERENT_HASH, "chunk_count": 3},
                 existing,
                 {"id": "new-uuid"},
             ]
@@ -643,6 +643,8 @@ async def test_non_hnsw_internal_error_not_retried(self):
         from app.services.indexing_service import store_prepared_document, PreparedDocument
 
         pool, mock_conn = _mock_pool(existing_row=None)
+        # store_prepared_document calls _do_store directly (no early-dedup)
+        mock_conn.fetchrow = AsyncMock(side_effect=[None, {"id": "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"}])
 
         prepared = PreparedDocument(
             content_hash=SAMPLE_HASH,