tale-project · larryro · Apr 11, 2026 · Apr 11, 2026 · Apr 11, 2026 · Apr 11, 2026
diff --git a/packages/tale_knowledge/src/tale_knowledge/extraction/image.py b/packages/tale_knowledge/src/tale_knowledge/extraction/image.py
@@ -48,7 +48,10 @@ async def extract_text_from_image_bytes(
         Tuple of (extracted_text, vision_was_used).
     """
     if not vision_client:
-        logger.warning(f"No vision client provided for image extraction: {filename}")
+        logger.warning(
+            f"No vision client configured for image extraction: {filename}. "
+            f"Image OCR requires a vision model in provider settings."
+        )
         return "", False
 
     logger.info(f"Processing image: {filename}")

diff --git a/packages/tale_knowledge/src/tale_knowledge/extraction/pdf.py b/packages/tale_knowledge/src/tale_knowledge/extraction/pdf.py
@@ -3,12 +3,20 @@
 Hybrid approach:
 1. Digital PDFs: Extract text directly using PyMuPDF (no API calls)
 2. Embedded images: Large images (>50% page area) are OCR'd, smaller ones described
+
+Scanned page detection:
+- Pages with < SCANNED_PAGE_TEXT_THRESHOLD characters of digital text are
+  considered "scanned" when they also contain at least one embedded image
+  covering > LARGE_IMAGE_RATIO of the page area.
+- The extraction result includes `scanned_pages_detected` and `ocr_applied`
+  so callers can inform the user about OCR activity.
 """
 
 from __future__ import annotations
 
 import asyncio
 from collections.abc import Callable
+from dataclasses import dataclass
 from functools import partial
 from typing import TYPE_CHECKING
 
@@ -23,10 +31,21 @@
 ProgressCallback = Callable[[int, int], None]  # (pages_done, total_pages)
 
 LARGE_IMAGE_RATIO = 0.5
+SCANNED_PAGE_TEXT_THRESHOLD = 50
 MAX_PAGES = 2000
 DEFAULT_PAGE_CONCURRENCY = 8
 
 
+@dataclass(frozen=True, slots=True)
+class PdfExtractionResult:
+    """Immutable result returned by extract_text_from_pdf_bytes."""
+
+    text: str  # page texts joined in page order, each under a "--- Page N ---" header
+    vision_used: bool  # True if any successfully processed page reported vision use
+    scanned_pages_detected: int  # pages with little digital text and a large image
+    ocr_applied: bool  # True if any scanned page also reported vision use
+
+
 def _extract_page_text_sync(page_bytes: bytes) -> dict:
     """Extract text and image blocks from a single page (runs in thread pool).
 
@@ -95,11 +114,14 @@ async def _extract_page_with_layout(
     vision_semaphore: asyncio.Semaphore,
     vision_client: VisionClient | None,
     process_images: bool,
-) -> tuple[str, bool]:
+) -> tuple[str, bool, bool]:
     """Extract page content preserving text and image positions.
 
     Images covering >50% of the page area are OCR'd (likely scanned pages),
     smaller images are described.
+
+    Returns:
+        Tuple of (content, vision_used, is_scanned_page).
     """
     loop = asyncio.get_running_loop()
 
@@ -108,8 +130,18 @@ async def _extract_page_with_layout(
     )
     elements: list[tuple[float, str]] = text_data["elements"]
     images: list[tuple[float, bytes, float]] = text_data["images"]
+    total_text_len: int = text_data["total_text_len"]
     vision_used = False
 
+    has_large_image = any(area_ratio > LARGE_IMAGE_RATIO for _, _, area_ratio in images)
+    is_scanned_page = total_text_len < SCANNED_PAGE_TEXT_THRESHOLD and has_large_image
+
+    if is_scanned_page and not vision_client:
+        logger.warning(
+            f"Page {page_num + 1}: Scanned page detected ({total_text_len} chars, "
+            f"large image present) but no vision client configured — OCR skipped"
+        )
+
     if process_images and vision_client and images:
         for y0, img_bytes, area_ratio in images:
             try:
@@ -132,7 +164,7 @@ async def _extract_page_with_layout(
 
     elements.sort(key=lambda x: x[0])
     content = "\n\n".join(elem[1] for elem in elements)
-    return content, vision_used
+    return content, vision_used, is_scanned_page
 
 
 async def extract_text_from_pdf_bytes(
@@ -143,7 +175,7 @@ async def extract_text_from_pdf_bytes(
     process_images: bool = True,
     max_pages: int = MAX_PAGES,
     on_progress: ProgressCallback | None = None,
-) -> tuple[str, bool]:
+) -> PdfExtractionResult:
     """Extract text from PDF bytes.
 
     Args:
@@ -156,7 +188,7 @@ async def extract_text_from_pdf_bytes(
             after each page completes.  Safe to call from concurrent tasks.
 
     Returns:
-        Tuple of (extracted_text, vision_was_used).
+        PdfExtractionResult with text, vision_used, scanned_pages_detected, and ocr_applied.
     """
     logger.info(f"Processing PDF: {filename}")
 
@@ -190,10 +222,12 @@ async def extract_text_from_pdf_bytes(
 
     pages_done = 0
 
-    async def process_page(page_num: int, page_bytes: bytes) -> tuple[int, str, bool]:
+    async def process_page(
+        page_num: int, page_bytes: bytes
+    ) -> tuple[int, str, bool, bool]:
         nonlocal pages_done
         async with page_semaphore:
-            content, vis_used = await _extract_page_with_layout(
+            content, vis_used, is_scanned = await _extract_page_with_layout(
                 page_bytes,
                 page_num,
                 vision_semaphore,
@@ -203,29 +237,52 @@ async def process_page(page_num: int, page_bytes: bytes) -> tuple[int, str, bool
             pages_done += 1
             if on_progress is not None:
                 on_progress(pages_done, pages_to_process)
-            return page_num, f"--- Page {page_num + 1} ---\n{content}", vis_used
+            return (
+                page_num,
+                f"--- Page {page_num + 1} ---\n{content}",
+                vis_used,
+                is_scanned,
+            )
 
     tasks = [process_page(pn, pb) for pn, pb in page_data]
     results = await asyncio.gather(*tasks, return_exceptions=True)
 
     pages_content: list[tuple[int, str]] = []
     vision_used = False
+    scanned_pages_detected = 0
+    ocr_applied = False
 
     for result in results:
         if isinstance(result, Exception):
             logger.warning(f"Page processing failed: {result}")
             continue
-        page_num, content, page_vision_used = result
+        page_num, content, page_vision_used, page_is_scanned = result
         pages_content.append((page_num, content))
         if page_vision_used:
             vision_used = True
+        if page_is_scanned:
+            scanned_pages_detected += 1
+            if page_vision_used:
+                ocr_applied = True
 
     pages_content.sort(key=lambda x: x[0])
     combined_text = "\n\n".join(p[1] for p in pages_content)
 
+    if scanned_pages_detected > 0 and not vision_client:
+        logger.warning(
+            f"PDF '{filename}': {scanned_pages_detected} scanned page(s) detected "
+            f"but no vision client configured — text may be incomplete"
+        )
+
     logger.info(
         f"PDF processing complete: {pages_to_process} pages, {len(combined_text)} chars, "
-        f"Vision API used: {vision_used}"
+        f"Vision API used: {vision_used}, scanned pages: {scanned_pages_detected}, "
+        f"OCR applied: {ocr_applied}"
     )
 
-    return combined_text, vision_used
+    return PdfExtractionResult(
+        text=combined_text,
+        vision_used=vision_used,
+        scanned_pages_detected=scanned_pages_detected,
+        ocr_applied=ocr_applied,
+    )
diff --git a/packages/tale_knowledge/src/tale_knowledge/extraction/router.py b/packages/tale_knowledge/src/tale_knowledge/extraction/router.py
@@ -67,13 +67,14 @@ async def extract_text(
     if suffix in PDF_EXTENSIONS:
         from .pdf import extract_text_from_pdf_bytes
 
-        return await extract_text_from_pdf_bytes(
+        result = await extract_text_from_pdf_bytes(
             file_bytes,
             filename,
             vision_client=vision_client,
             process_images=process_images,
             on_progress=on_progress,
         )
+        return result.text, result.vision_used
 
     if suffix in DOCX_EXTENSIONS:
         from .docx import extract_text_from_docx_bytes

diff --git a/packages/tale_knowledge/tests/test_pdf.py b/packages/tale_knowledge/tests/test_pdf.py
@@ -7,6 +7,7 @@
 
 from tale_knowledge.extraction.pdf import (
     LARGE_IMAGE_RATIO,
+    PdfExtractionResult,
     _extract_page_text_sync,
     extract_text_from_pdf_bytes,
 )
@@ -114,9 +115,12 @@ class TestExtractTextFromPdfBytes:
     @pytest.mark.asyncio
     async def test_digital_pdf_extraction(self):
         pdf_bytes = _make_simple_pdf("Hello World")
-        text, vision_used = await extract_text_from_pdf_bytes(pdf_bytes)
-        assert "Hello World" in text
-        assert vision_used is False
+        result = await extract_text_from_pdf_bytes(pdf_bytes)
+        assert isinstance(result, PdfExtractionResult)
+        assert "Hello World" in result.text
+        assert result.vision_used is False
+        assert result.scanned_pages_detected == 0
+        assert result.ocr_applied is False
 
     @pytest.mark.asyncio
     async def test_multi_page_pdf(self):
@@ -127,11 +131,11 @@ async def test_multi_page_pdf(self):
         pdf_bytes = doc.tobytes()
         doc.close()
 
-        text, vision_used = await extract_text_from_pdf_bytes(pdf_bytes)
-        assert "Page 1" in text
-        assert "Page 2" in text
-        assert "Page 3" in text
-        assert vision_used is False
+        result = await extract_text_from_pdf_bytes(pdf_bytes)
+        assert "Page 1" in result.text
+        assert "Page 2" in result.text
+        assert "Page 3" in result.text
+        assert result.vision_used is False
 
     @pytest.mark.asyncio
     async def test_empty_pdf(self):
@@ -140,17 +144,15 @@ async def test_empty_pdf(self):
         pdf_bytes = doc.tobytes()
         doc.close()
 
-        text, vision_used = await extract_text_from_pdf_bytes(pdf_bytes)
-        assert "--- Page 1 ---" in text
+        result = await extract_text_from_pdf_bytes(pdf_bytes)
+        assert "--- Page 1 ---" in result.text
 
     @pytest.mark.asyncio
     async def test_no_vision_without_client(self):
         pdf_bytes = _make_simple_pdf("Digital text only")
-        text, vision_used = await extract_text_from_pdf_bytes(
-            pdf_bytes, vision_client=None
-        )
-        assert "Digital text only" in text
-        assert vision_used is False
+        result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=None)
+        assert "Digital text only" in result.text
+        assert result.vision_used is False
 
     @pytest.mark.asyncio
     async def test_image_described_with_vision_client(self):
@@ -161,24 +163,20 @@ async def test_image_described_with_vision_client(self):
         mock_client.max_concurrent_pages = 3
         mock_client.describe_image = AsyncMock(return_value="A red square image")
 
-        text, vision_used = await extract_text_from_pdf_bytes(
-            pdf_bytes, vision_client=mock_client
-        )
-        assert long_text in text
-        assert "[Image: A red square image]" in text
-        assert vision_used is True
+        result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=mock_client)
+        assert long_text in result.text
+        assert "[Image: A red square image]" in result.text
+        assert result.vision_used is True
         mock_client.describe_image.assert_called()
 
     @pytest.mark.asyncio
     async def test_image_skipped_without_vision_client(self):
         long_text = "This document has embedded images but no vision client provided"
         pdf_bytes = _make_pdf_with_image(long_text, image_size=200)
-        text, vision_used = await extract_text_from_pdf_bytes(
-            pdf_bytes, vision_client=None
-        )
-        assert long_text in text
-        assert "[Image:" not in text
-        assert vision_used is False
+        result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=None)
+        assert long_text in result.text
+        assert "[Image:" not in result.text
+        assert result.vision_used is False
 
     @pytest.mark.asyncio
     async def test_image_skipped_when_process_images_false(self):
@@ -188,11 +186,11 @@ async def test_image_skipped_when_process_images_false(self):
         mock_client = AsyncMock()
         mock_client.max_concurrent_pages = 3
 
-        text, vision_used = await extract_text_from_pdf_bytes(
+        result = await extract_text_from_pdf_bytes(
             pdf_bytes, vision_client=mock_client, process_images=False
         )
-        assert long_text in text
-        assert "[Image:" not in text
+        assert long_text in result.text
+        assert "[Image:" not in result.text
         mock_client.describe_image.assert_not_called()
 
     @pytest.mark.asyncio
@@ -203,10 +201,19 @@ async def test_large_image_uses_ocr(self):
         mock_client.max_concurrent_pages = 3
         mock_client.ocr_image = AsyncMock(return_value="OCR extracted text from scan")
 
-        text, vision_used = await extract_text_from_pdf_bytes(
-            pdf_bytes, vision_client=mock_client
-        )
-        assert "OCR extracted text from scan" in text
-        assert vision_used is True
+        result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=mock_client)
+        assert "OCR extracted text from scan" in result.text
+        assert result.vision_used is True
+        assert result.scanned_pages_detected == 1
+        assert result.ocr_applied is True
         mock_client.ocr_image.assert_called()
         mock_client.describe_image.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_scanned_page_without_vision_client(self):
+        pdf_bytes = _make_fullpage_image_pdf()  # presumably one image-dominated page; see helper
+
+        result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=None)
+        assert result.scanned_pages_detected == 1  # detection does not need a vision client
+        assert result.ocr_applied is False  # no client, so the scanned page cannot be OCR'd
+        assert result.vision_used is False  # and no vision use is reported
diff --git a/services/crawler/app/models.py b/services/crawler/app/models.py
@@ -376,6 +376,9 @@ class FileMetadataResponse(BaseModel):
     slide_count: int | None = Field(None, description="Number of slides (PPTX)")
     created_at: int | None = Field(None, description="Document creation date (Unix ms)")
     modified_at: int | None = Field(None, description="Document modification date (Unix ms)")
+    scanned_pages_detected: int | None = Field(
+        None, description="Number of pages detected as scanned (low text content)"
+    )
     error: str | None = Field(None, description="Error message if extraction failed")
 
 

diff --git a/services/crawler/app/routers/docx.py b/services/crawler/app/routers/docx.py
@@ -499,6 +499,7 @@ async def extract_docx_metadata(file: UploadFile = _FILE_UPLOAD):
             author=meta["author"] or None,
             created_at=meta["created_at"],
             modified_at=meta["modified_at"],
+            scanned_pages_detected=0,
         )
     except HTTPException:
         raise

diff --git a/services/crawler/app/routers/pdf.py b/services/crawler/app/routers/pdf.py
@@ -251,6 +251,20 @@ async def extract_pdf_metadata(file: UploadFile = _FILE_UPLOAD):
         doc = fitz.open(stream=file_bytes, filetype="pdf")
         raw = doc.metadata or {}
         page_count = len(doc)
+
+        large_image_ratio = 0.5  # same value as tale_knowledge's LARGE_IMAGE_RATIO
+        scanned_text_threshold = 50  # same value as SCANNED_PAGE_TEXT_THRESHOLD
+        scanned_count = 0
+        # Scanned = large image AND little digital text (rule from tale_knowledge pdf.py).
+        for page in doc:
+            page_area = page.rect.get_area()
+            has_large_image = page_area > 0 and any(
+                bbox and bbox.get_area() / page_area > large_image_ratio
+                for bbox in map(page.get_image_bbox, page.get_images(full=True))
+            )
+            if has_large_image and len(page.get_text().strip()) < scanned_text_threshold:
+                scanned_count += 1
+
         doc.close()
 
         return FileMetadataResponse(
@@ -262,6 +276,7 @@ async def extract_pdf_metadata(file: UploadFile = _FILE_UPLOAD):
             page_count=page_count,
             created_at=_parse_pdf_date(raw.get("creationDate")),
             modified_at=_parse_pdf_date(raw.get("modDate")),
+            scanned_pages_detected=scanned_count,
         )
     except HTTPException:
         raise

diff --git a/services/crawler/app/routers/pptx.py b/services/crawler/app/routers/pptx.py
@@ -301,6 +301,7 @@ async def extract_pptx_metadata(file: UploadFile = _FILE_UPLOAD):
             slide_count=len(prs.slides),
             created_at=meta["created_at"],
             modified_at=meta["modified_at"],
+            scanned_pages_detected=0,
         )
     except HTTPException:
         raise