Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,10 @@ async def extract_text_from_image_bytes(
Tuple of (extracted_text, vision_was_used).
"""
if not vision_client:
logger.warning(f"No vision client provided for image extraction: {filename}")
logger.warning(
f"No vision client configured for image extraction: {filename}. "
f"Image OCR requires a vision model in provider settings."
)
return "", False

logger.info(f"Processing image: {filename}")
Expand Down
77 changes: 67 additions & 10 deletions packages/tale_knowledge/src/tale_knowledge/extraction/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,20 @@
Hybrid approach:
1. Digital PDFs: Extract text directly using PyMuPDF (no API calls)
2. Embedded images: Large images (>50% page area) are OCR'd, smaller ones described

Scanned page detection:
- Pages with < SCANNED_PAGE_TEXT_THRESHOLD characters of digital text are
considered "scanned" when they also contain at least one embedded image
covering > LARGE_IMAGE_RATIO of the page area.
- The extraction result includes `scanned_pages_detected` and `ocr_applied`
so callers can inform the user about OCR activity.
"""

from __future__ import annotations

import asyncio
from collections.abc import Callable
from dataclasses import dataclass
from functools import partial
from typing import TYPE_CHECKING

Expand All @@ -23,10 +31,21 @@
ProgressCallback = Callable[[int, int], None] # (pages_done, total_pages)

LARGE_IMAGE_RATIO = 0.5
SCANNED_PAGE_TEXT_THRESHOLD = 50
MAX_PAGES = 2000
DEFAULT_PAGE_CONCURRENCY = 8


@dataclass(frozen=True, slots=True)
class PdfExtractionResult:
    """Result of a PDF text extraction.

    Immutable record returned by ``extract_text_from_pdf_bytes``. Pages whose
    processing raised an exception are skipped by the aggregation step and
    therefore contribute to none of the fields below.
    """

    # Text of all successfully processed pages, in page order. Each page is
    # prefixed with a "--- Page N ---" header and pages are separated by a
    # blank line.
    text: str
    # True when at least one page reported that the vision client was used
    # (for either OCR or image description).
    vision_used: bool
    # Number of pages flagged as scanned: fewer than
    # SCANNED_PAGE_TEXT_THRESHOLD characters of digital text AND at least one
    # embedded image covering more than LARGE_IMAGE_RATIO of the page area.
    # Counted even when no vision client is configured.
    scanned_pages_detected: int
    # True when at least one page flagged as scanned also reported vision use.
    # Stays False when scanned pages were detected but no vision call succeeded.
    ocr_applied: bool


def _extract_page_text_sync(page_bytes: bytes) -> dict:
"""Extract text and image blocks from a single page (runs in thread pool).

Expand Down Expand Up @@ -95,11 +114,14 @@ async def _extract_page_with_layout(
vision_semaphore: asyncio.Semaphore,
vision_client: VisionClient | None,
process_images: bool,
) -> tuple[str, bool]:
) -> tuple[str, bool, bool]:
"""Extract page content preserving text and image positions.

Images covering >50% of the page area are OCR'd (likely scanned pages),
smaller images are described.

Returns:
Tuple of (content, vision_used, is_scanned_page).
"""
loop = asyncio.get_running_loop()

Expand All @@ -108,8 +130,18 @@ async def _extract_page_with_layout(
)
elements: list[tuple[float, str]] = text_data["elements"]
images: list[tuple[float, bytes, float]] = text_data["images"]
total_text_len: int = text_data["total_text_len"]
vision_used = False

has_large_image = any(area_ratio > LARGE_IMAGE_RATIO for _, _, area_ratio in images)
is_scanned_page = total_text_len < SCANNED_PAGE_TEXT_THRESHOLD and has_large_image

if is_scanned_page and not vision_client:
logger.warning(
f"Page {page_num + 1}: Scanned page detected ({total_text_len} chars, "
f"large image present) but no vision client configured — OCR skipped"
)

if process_images and vision_client and images:
for y0, img_bytes, area_ratio in images:
try:
Expand All @@ -132,7 +164,7 @@ async def _extract_page_with_layout(

elements.sort(key=lambda x: x[0])
content = "\n\n".join(elem[1] for elem in elements)
return content, vision_used
return content, vision_used, is_scanned_page


async def extract_text_from_pdf_bytes(
Expand All @@ -143,7 +175,7 @@ async def extract_text_from_pdf_bytes(
process_images: bool = True,
max_pages: int = MAX_PAGES,
on_progress: ProgressCallback | None = None,
) -> tuple[str, bool]:
) -> PdfExtractionResult:
"""Extract text from PDF bytes.

Args:
Expand All @@ -156,7 +188,7 @@ async def extract_text_from_pdf_bytes(
after each page completes. Safe to call from concurrent tasks.

Returns:
Tuple of (extracted_text, vision_was_used).
PdfExtractionResult with text, vision_used, scanned_pages_detected, and ocr_applied.
"""
logger.info(f"Processing PDF: {filename}")

Expand Down Expand Up @@ -190,10 +222,12 @@ async def extract_text_from_pdf_bytes(

pages_done = 0

async def process_page(page_num: int, page_bytes: bytes) -> tuple[int, str, bool]:
async def process_page(
page_num: int, page_bytes: bytes
) -> tuple[int, str, bool, bool]:
nonlocal pages_done
async with page_semaphore:
content, vis_used = await _extract_page_with_layout(
content, vis_used, is_scanned = await _extract_page_with_layout(
page_bytes,
page_num,
vision_semaphore,
Expand All @@ -203,29 +237,52 @@ async def process_page(page_num: int, page_bytes: bytes) -> tuple[int, str, bool
pages_done += 1
if on_progress is not None:
on_progress(pages_done, pages_to_process)
return page_num, f"--- Page {page_num + 1} ---\n{content}", vis_used
return (
page_num,
f"--- Page {page_num + 1} ---\n{content}",
vis_used,
is_scanned,
)

tasks = [process_page(pn, pb) for pn, pb in page_data]
results = await asyncio.gather(*tasks, return_exceptions=True)

pages_content: list[tuple[int, str]] = []
vision_used = False
scanned_pages_detected = 0
ocr_applied = False

for result in results:
if isinstance(result, Exception):
logger.warning(f"Page processing failed: {result}")
continue
page_num, content, page_vision_used = result
page_num, content, page_vision_used, page_is_scanned = result
pages_content.append((page_num, content))
if page_vision_used:
vision_used = True
if page_is_scanned:
scanned_pages_detected += 1
if page_vision_used:
ocr_applied = True

pages_content.sort(key=lambda x: x[0])
combined_text = "\n\n".join(p[1] for p in pages_content)

if scanned_pages_detected > 0 and not vision_client:
logger.warning(
f"PDF '{filename}': {scanned_pages_detected} scanned page(s) detected "
f"but no vision client configured — text may be incomplete"
)

logger.info(
f"PDF processing complete: {pages_to_process} pages, {len(combined_text)} chars, "
f"Vision API used: {vision_used}"
f"Vision API used: {vision_used}, scanned pages: {scanned_pages_detected}, "
f"OCR applied: {ocr_applied}"
)

return combined_text, vision_used
return PdfExtractionResult(
text=combined_text,
vision_used=vision_used,
scanned_pages_detected=scanned_pages_detected,
ocr_applied=ocr_applied,
)
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,14 @@ async def extract_text(
if suffix in PDF_EXTENSIONS:
from .pdf import extract_text_from_pdf_bytes

return await extract_text_from_pdf_bytes(
result = await extract_text_from_pdf_bytes(
file_bytes,
filename,
vision_client=vision_client,
process_images=process_images,
on_progress=on_progress,
)
return result.text, result.vision_used

if suffix in DOCX_EXTENSIONS:
from .docx import extract_text_from_docx_bytes
Expand Down
77 changes: 42 additions & 35 deletions packages/tale_knowledge/tests/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from tale_knowledge.extraction.pdf import (
LARGE_IMAGE_RATIO,
PdfExtractionResult,
_extract_page_text_sync,
extract_text_from_pdf_bytes,
)
Expand Down Expand Up @@ -114,9 +115,12 @@ class TestExtractTextFromPdfBytes:
@pytest.mark.asyncio
async def test_digital_pdf_extraction(self):
pdf_bytes = _make_simple_pdf("Hello World")
text, vision_used = await extract_text_from_pdf_bytes(pdf_bytes)
assert "Hello World" in text
assert vision_used is False
result = await extract_text_from_pdf_bytes(pdf_bytes)
assert isinstance(result, PdfExtractionResult)
assert "Hello World" in result.text
assert result.vision_used is False
assert result.scanned_pages_detected == 0
assert result.ocr_applied is False

@pytest.mark.asyncio
async def test_multi_page_pdf(self):
Expand All @@ -127,11 +131,11 @@ async def test_multi_page_pdf(self):
pdf_bytes = doc.tobytes()
doc.close()

text, vision_used = await extract_text_from_pdf_bytes(pdf_bytes)
assert "Page 1" in text
assert "Page 2" in text
assert "Page 3" in text
assert vision_used is False
result = await extract_text_from_pdf_bytes(pdf_bytes)
assert "Page 1" in result.text
assert "Page 2" in result.text
assert "Page 3" in result.text
assert result.vision_used is False

@pytest.mark.asyncio
async def test_empty_pdf(self):
Expand All @@ -140,17 +144,15 @@ async def test_empty_pdf(self):
pdf_bytes = doc.tobytes()
doc.close()

text, vision_used = await extract_text_from_pdf_bytes(pdf_bytes)
assert "--- Page 1 ---" in text
result = await extract_text_from_pdf_bytes(pdf_bytes)
assert "--- Page 1 ---" in result.text

@pytest.mark.asyncio
async def test_no_vision_without_client(self):
pdf_bytes = _make_simple_pdf("Digital text only")
text, vision_used = await extract_text_from_pdf_bytes(
pdf_bytes, vision_client=None
)
assert "Digital text only" in text
assert vision_used is False
result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=None)
assert "Digital text only" in result.text
assert result.vision_used is False

@pytest.mark.asyncio
async def test_image_described_with_vision_client(self):
Expand All @@ -161,24 +163,20 @@ async def test_image_described_with_vision_client(self):
mock_client.max_concurrent_pages = 3
mock_client.describe_image = AsyncMock(return_value="A red square image")

text, vision_used = await extract_text_from_pdf_bytes(
pdf_bytes, vision_client=mock_client
)
assert long_text in text
assert "[Image: A red square image]" in text
assert vision_used is True
result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=mock_client)
assert long_text in result.text
assert "[Image: A red square image]" in result.text
assert result.vision_used is True
mock_client.describe_image.assert_called()

@pytest.mark.asyncio
async def test_image_skipped_without_vision_client(self):
long_text = "This document has embedded images but no vision client provided"
pdf_bytes = _make_pdf_with_image(long_text, image_size=200)
text, vision_used = await extract_text_from_pdf_bytes(
pdf_bytes, vision_client=None
)
assert long_text in text
assert "[Image:" not in text
assert vision_used is False
result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=None)
assert long_text in result.text
assert "[Image:" not in result.text
assert result.vision_used is False

@pytest.mark.asyncio
async def test_image_skipped_when_process_images_false(self):
Expand All @@ -188,11 +186,11 @@ async def test_image_skipped_when_process_images_false(self):
mock_client = AsyncMock()
mock_client.max_concurrent_pages = 3

text, vision_used = await extract_text_from_pdf_bytes(
result = await extract_text_from_pdf_bytes(
pdf_bytes, vision_client=mock_client, process_images=False
)
assert long_text in text
assert "[Image:" not in text
assert long_text in result.text
assert "[Image:" not in result.text
mock_client.describe_image.assert_not_called()

@pytest.mark.asyncio
Expand All @@ -203,10 +201,19 @@ async def test_large_image_uses_ocr(self):
mock_client.max_concurrent_pages = 3
mock_client.ocr_image = AsyncMock(return_value="OCR extracted text from scan")

text, vision_used = await extract_text_from_pdf_bytes(
pdf_bytes, vision_client=mock_client
)
assert "OCR extracted text from scan" in text
assert vision_used is True
result = await extract_text_from_pdf_bytes(pdf_bytes, vision_client=mock_client)
assert "OCR extracted text from scan" in result.text
assert result.vision_used is True
assert result.scanned_pages_detected == 1
assert result.ocr_applied is True
mock_client.ocr_image.assert_called()
mock_client.describe_image.assert_not_called()

@pytest.mark.asyncio
async def test_scanned_page_without_vision_client(self):
    """Scanned-page detection is reported even when OCR cannot run.

    Uses the PDF produced by ``_make_fullpage_image_pdf`` and extracts it
    with ``vision_client=None``.
    """
    scanned_pdf = _make_fullpage_image_pdf()

    outcome = await extract_text_from_pdf_bytes(scanned_pdf, vision_client=None)

    # The page must still be counted as scanned so callers can tell the user
    # that text may be missing; no vision call can have happened.
    assert outcome.scanned_pages_detected == 1
    assert outcome.ocr_applied is False
    assert outcome.vision_used is False
Comment on lines +212 to +219

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Assert the warning path in this regression test.

This test proves the counters, but the new behavior also relies on emitting a warning when a scanned page is detected without a vision client. Add a log assertion here so that path is covered in CI instead of staying manual-only.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@packages/tale_knowledge/tests/test_pdf.py` around lines 212 - 219, The test
test_scanned_page_without_vision_client should also assert the warning is
emitted when a scanned page is detected without a vision client: wrap the call
to extract_text_from_pdf_bytes(vision_client=None) in a logging/warning capture
(e.g., use pytest's caplog or pytest.warns) and assert that the expected warning
message (the one produced by the scanned-page-without-vision-client code path)
appears; update the test to reference extract_text_from_pdf_bytes and assert the
warning text and/or warning category so CI verifies the warning path.

3 changes: 3 additions & 0 deletions services/crawler/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,9 @@ class FileMetadataResponse(BaseModel):
slide_count: int | None = Field(None, description="Number of slides (PPTX)")
created_at: int | None = Field(None, description="Document creation date (Unix ms)")
modified_at: int | None = Field(None, description="Document modification date (Unix ms)")
scanned_pages_detected: int | None = Field(
None, description="Number of pages detected as scanned (low text content)"
)
error: str | None = Field(None, description="Error message if extraction failed")


Expand Down
1 change: 1 addition & 0 deletions services/crawler/app/routers/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,7 @@ async def extract_docx_metadata(file: UploadFile = _FILE_UPLOAD):
author=meta["author"] or None,
created_at=meta["created_at"],
modified_at=meta["modified_at"],
scanned_pages_detected=0,
)
except HTTPException:
raise
Expand Down
15 changes: 15 additions & 0 deletions services/crawler/app/routers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,20 @@ async def extract_pdf_metadata(file: UploadFile = _FILE_UPLOAD):
doc = fitz.open(stream=file_bytes, filetype="pdf")
raw = doc.metadata or {}
page_count = len(doc)

large_image_ratio = 0.5
scanned_count = 0
for page_num in range(page_count):
page = doc[page_num]
page_area = page.rect.get_area()
if page_area <= 0:
continue
for img in page.get_images(full=True):
bbox = page.get_image_bbox(img)
if bbox and bbox.get_area() / page_area > large_image_ratio:
scanned_count += 1
break

Comment on lines +255 to +267

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use the same scanned-page heuristic as the extraction pipeline.

This endpoint only applies the large-image-area check, but the PR’s extraction path also uses low-text detection. That means extract-metadata can return scanned_pages_detected = 0 for PDFs that extract_text_from_pdf_bytes will later treat as scanned and OCR, so the UI’s OCR indicator/visionRequired flag can drift from actual extraction behavior. Please reuse the shared detector or route this through the same metadata helper.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@services/crawler/app/routers/pdf.py` around lines 255 - 267, The current loop
only uses the large_image_ratio heuristic (variables page.get_images, bbox,
large_image_ratio, scanned_count) to mark scanned pages, which differs from the
extraction pipeline (e.g., extract_text_from_pdf_bytes) that also uses low-text
detection and sets scanned_pages_detected; replace or augment this logic to call
the shared scanned-page detector or the existing metadata helper used by
extract_text_from_pdf_bytes so both paths use the same detection logic and value
for scanned_pages_detected, i.e., invoke the common function instead of only
counting large-image pages (or combine both heuristics via that shared helper)
so the UI/visionRequired flag stays consistent.

doc.close()

return FileMetadataResponse(
Expand All @@ -262,6 +276,7 @@ async def extract_pdf_metadata(file: UploadFile = _FILE_UPLOAD):
page_count=page_count,
created_at=_parse_pdf_date(raw.get("creationDate")),
modified_at=_parse_pdf_date(raw.get("modDate")),
scanned_pages_detected=scanned_count,
)
except HTTPException:
raise
Expand Down
1 change: 1 addition & 0 deletions services/crawler/app/routers/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ async def extract_pptx_metadata(file: UploadFile = _FILE_UPLOAD):
slide_count=len(prs.slides),
created_at=meta["created_at"],
modified_at=meta["modified_at"],
scanned_pages_detected=0,
)
except HTTPException:
raise
Expand Down
Loading
Loading