Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
c54cd12
feat(platform): simplify file tools — auto RAG indexing, remove parse…
larryro Apr 11, 2026
4139717
fix: resolve CI failures — remove unused exports and fix RAG test mock
larryro Apr 11, 2026
9ef9180
feat(platform): track RAG indexing status for chat file attachments
larryro Apr 11, 2026
4507141
fix(platform): extend chat file indexing poll timeout to ~31 minutes
larryro Apr 11, 2026
aae978d
feat(platform): add progress tracking for file indexing
larryro Apr 11, 2026
02f7253
fix(platform): truncate large documents in rag_search retrieve operation
larryro Apr 11, 2026
abefcd4
refactor(platform): paginate rag_search retrieve — default 10 chunks …
larryro Apr 11, 2026
d2474b2
fix(platform): show actual error messages in chat instead of generic …
larryro Apr 11, 2026
1b548a9
Revert "fix(platform): show actual error messages in chat instead of …
larryro Apr 11, 2026
0cd4beb
fix(platform): show actual error messages in chat for failed agent re…
larryro Apr 11, 2026
7efff3d
fix(db): make semantic_cache embedding nullable and defer HNSW index …
larryro Apr 11, 2026
898eb81
fix(platform): resolve message metadata lookup for failed multi-step …
larryro Apr 11, 2026
606f042
fix(platform): show progress as percentage and extend polling to 2.5 …
larryro Apr 11, 2026
eb4abd2
refactor(platform): move RAG status polling to client and enhance sou…
larryro Apr 11, 2026
da68b55
refactor(platform): improve citation handling for file retrieve opera…
larryro Apr 11, 2026
c9da9dd
fix(platform): improve RAG tool instructions for file ID prioritization
larryro Apr 11, 2026
8a6b718
fix: resolve CI typecheck and test failures
larryro Apr 11, 2026
1894343
fix: resolve lint and test failures
larryro Apr 11, 2026
5b3f223
fix: add ragStatus to internal_mutations test expectations
larryro Apr 11, 2026
5d19a6c
feat(crawler): add /from-html and /from-markdown endpoints to PPTX ro…
larryro Apr 11, 2026
378d173
feat(platform): replace PPTX generation with HTML slide presentations
larryro Apr 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""File text extraction modules."""

from .router import extract_text
from .router import ProgressCallback, extract_text

__all__ = ["extract_text"]
__all__ = ["ProgressCallback", "extract_text"]
12 changes: 12 additions & 0 deletions packages/tale_knowledge/src/tale_knowledge/extraction/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from __future__ import annotations

import asyncio
from collections.abc import Callable
from functools import partial
from typing import TYPE_CHECKING

Expand All @@ -19,6 +20,8 @@
if TYPE_CHECKING:
from tale_knowledge.vision.client import VisionClient

ProgressCallback = Callable[[int, int], None] # (pages_done, total_pages)

LARGE_IMAGE_RATIO = 0.5
MAX_PAGES = 2000
DEFAULT_PAGE_CONCURRENCY = 8
Expand Down Expand Up @@ -139,6 +142,7 @@ async def extract_text_from_pdf_bytes(
vision_client: VisionClient | None = None,
process_images: bool = True,
max_pages: int = MAX_PAGES,
on_progress: ProgressCallback | None = None,
) -> tuple[str, bool]:
"""Extract text from PDF bytes.

Expand All @@ -148,6 +152,8 @@ async def extract_text_from_pdf_bytes(
vision_client: Optional VisionClient for OCR/image description.
process_images: Whether to extract and describe embedded images.
max_pages: Maximum number of pages to process.
on_progress: Optional callback ``(pages_done, total_pages)`` invoked
after each page completes. Safe to call from concurrent tasks.

Returns:
Tuple of (extracted_text, vision_was_used).
Expand Down Expand Up @@ -182,7 +188,10 @@ async def extract_text_from_pdf_bytes(
finally:
doc.close()

pages_done = 0

async def process_page(page_num: int, page_bytes: bytes) -> tuple[int, str, bool]:
nonlocal pages_done
async with page_semaphore:
content, vis_used = await _extract_page_with_layout(
page_bytes,
Expand All @@ -191,6 +200,9 @@ async def process_page(page_num: int, page_bytes: bytes) -> tuple[int, str, bool
vision_client,
process_images,
)
pages_done += 1
if on_progress is not None:
on_progress(pages_done, pages_to_process)
return page_num, f"--- Page {page_num + 1} ---\n{content}", vis_used

tasks = [process_page(pn, pb) for pn, pb in page_data]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

from collections.abc import Callable
from pathlib import Path
from typing import TYPE_CHECKING

Expand Down Expand Up @@ -33,12 +34,16 @@ def is_supported(filename: str) -> bool:
return Path(filename).suffix.lower() in ALL_SUPPORTED_EXTENSIONS


ProgressCallback = Callable[[int, int], None]


async def extract_text(
file_bytes: bytes,
filename: str,
*,
vision_client: VisionClient | None = None,
process_images: bool = True,
on_progress: ProgressCallback | None = None,
) -> tuple[str, bool]:
"""Extract text from file bytes, routing to the correct extractor.

Expand All @@ -47,6 +52,8 @@ async def extract_text(
filename: Original filename (used to determine file type).
vision_client: Optional VisionClient for OCR/image description.
process_images: Whether to extract and describe embedded images.
on_progress: Optional callback ``(done, total)`` for page-level progress
(currently only used by PDF extraction).

Returns:
Tuple of (extracted_text, vision_was_used).
Expand All @@ -65,6 +72,7 @@ async def extract_text(
filename,
vision_client=vision_client,
process_images=process_images,
on_progress=on_progress,
)

if suffix in DOCX_EXTENSIONS:
Expand Down
12 changes: 12 additions & 0 deletions services/crawler/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,18 @@ class HtmlToDocxRequest(BaseModel):
# ==================== PPTX Models ====================


class MarkdownToPptxRequest(BaseModel):
    """Request to convert Markdown to PPTX."""

    # Required field (``...`` means no default): the Markdown source text.
    content: str = Field(
        ...,
        description="Markdown content to convert",
    )


class HtmlToPptxRequest(BaseModel):
    """Request to convert HTML to PPTX."""

    # Required field (``...`` means no default): the HTML markup to convert.
    html: str = Field(
        ...,
        description="HTML content to convert",
    )


class TableData(BaseModel):
"""Table data for PPTX generation."""

Expand Down
85 changes: 85 additions & 0 deletions services/crawler/app/routers/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@
import json

from fastapi import APIRouter, File, Form, HTTPException, UploadFile, status
from fastapi.responses import Response
from loguru import logger

from app.models import (
FileMetadataResponse,
GeneratePptxResponse,
HtmlToPptxRequest,
MarkdownToPptxRequest,
ParseFileResponse,
)
from app.services.file_parser_service import get_file_parser_service
Expand Down Expand Up @@ -133,6 +136,88 @@ async def generate_pptx_from_json(
)


# MIME type for .pptx (Office Open XML presentation) payloads; used as the
# ``media_type`` of the binary responses returned by the conversion endpoints.
_PPTX_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.presentation"


@router.post("/from-markdown")
async def convert_markdown_to_pptx(request: MarkdownToPptxRequest):
    """
    Convert Markdown content to PPTX.

    Parses markdown into HTML, then extracts slide structure (headings become
    slide titles, lists become bullet points, etc.) and generates a PowerPoint.

    Args:
        request: Markdown content

    Returns:
        PPTX file as binary response
    """
    try:
        # The imports stay inside the ``try`` so that an import failure is
        # logged and reported as the same generic 500 as any other error.
        from app.services.base_converter import BaseConverterService
        from app.services.html_to_pptx_converter import html_to_slides

        # Step 1: Markdown -> HTML.
        markdown_renderer = BaseConverterService()
        rendered_html = await markdown_renderer.markdown_to_html(request.content)

        # Step 2: HTML -> slide structure.
        slide_specs = html_to_slides(rendered_html)

        # Step 3: slide structure -> PPTX bytes.
        pptx_builder = get_template_service()
        deck_bytes = await pptx_builder.generate_pptx_from_content(
            slides_content=slide_specs,
        )

        # Step 4: hand the bytes back as a file download.
        download_headers = {"Content-Disposition": "attachment; filename=presentation.pptx"}
        return Response(
            content=deck_bytes,
            media_type=_PPTX_CONTENT_TYPE,
            headers=download_headers,
        )

    except Exception:
        # Full traceback goes to the log; the client only sees a generic
        # message, and ``from None`` drops the chained cause from the error.
        logger.exception("Error converting markdown to PPTX")
        failure = HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to convert markdown to PPTX",
        )
        raise failure from None


@router.post("/from-html")
async def convert_html_to_pptx(request: HtmlToPptxRequest):
    """
    Convert HTML content to PPTX.

    Parses HTML to extract slide structure (h1/h2 headings become slide titles,
    lists become bullet points, tables preserved) and generates a PowerPoint.

    Args:
        request: HTML content

    Returns:
        PPTX file as binary response
    """
    try:
        # The import stays inside the ``try`` so that an import failure is
        # logged and reported as the same generic 500 as any other error.
        from app.services.html_to_pptx_converter import html_to_slides

        # Step 1: HTML -> slide structure.
        slide_specs = html_to_slides(request.html)

        # Step 2: slide structure -> PPTX bytes.
        pptx_builder = get_template_service()
        deck_bytes = await pptx_builder.generate_pptx_from_content(
            slides_content=slide_specs,
        )

        # Step 3: hand the bytes back as a file download.
        download_headers = {"Content-Disposition": "attachment; filename=presentation.pptx"}
        return Response(
            content=deck_bytes,
            media_type=_PPTX_CONTENT_TYPE,
            headers=download_headers,
        )

    except Exception:
        # Full traceback goes to the log; the client only sees a generic
        # message, and ``from None`` drops the chained cause from the error.
        logger.exception("Error converting HTML to PPTX")
        failure = HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Failed to convert HTML to PPTX",
        )
        raise failure from None


@router.post("/parse", response_model=ParseFileResponse)
async def parse_pptx_file(
file: UploadFile = _FILE_UPLOAD,
Expand Down
Loading
Loading