stackitcloud · a-klos · Nov 17, 2025 · Nov 11, 2025 · Nov 12, 2025 · Nov 12, 2025
diff --git a/README.md b/README.md
@@ -51,7 +51,7 @@ Welcome to the STACKIT RAG Template! This is a basic example of how to use the R
 
 ## Features 🚀
 
-**Document Management**: Supports PDFs, DOCX, PPTX, XML, EPUB documents and websource via confluence as well as sitemaps.
+**Document Management**: Supports PDFs, Office docs (DOCX, PPTX), spreadsheets (XLSX), Markdown/AsciiDoc (MD, MDX, ADOC), EPUB/HTML/XML, CSV/TXT, and raster images, with automatic fallbacks between Docling, MarkItDown, and custom extractors; also handles Confluence spaces and sitemaps.
 
 **AI Integration**: Multiple LLM and embedder providers for flexibility.
 
@@ -109,9 +109,9 @@ All components are provided by the *admin-api-lib*. For further information on e
 
 #### 1.1.3 Document extractor
 
-The Document extractor is a component that is used to extract the content from the documents and confluence spaces.
+The Document extractor ingests uploaded files and remote sources (Confluence, sitemap) and now orchestrates multiple extractors with a deterministic fallback chain. Docling runs first for rich formats (PDF, Office, Markdown, HTML, images), MarkItDown provides lightweight markdown conversion, and specialised custom extractors (PDF, MS Office, XML, EPUB, Tesseract OCR) handle edge cases. The order and availability can be customised through the dependency-injector container.
 
-All components are provided by the *extractor-api-lib*. For further information on endpoints and requirements, please consult [the libs README](./libs/README.md#3-extractor-api-lib).
+All components are provided by the *extractor-api-lib*. For further information on endpoints, extractor ordering, supported formats, and configuration tips, please consult [the libs README](./libs/README.md#3-extractor-api-lib).
 
 #### 1.1.4 MCP Server
 

diff --git a/infrastructure/rag/templates/_admin_backend_and_extractor_helpers.tpl b/infrastructure/rag/templates/_admin_backend_and_extractor_helpers.tpl
@@ -52,10 +52,6 @@
 {{- printf "%s-langfuse-configmap" .Release.Name | trunc 63 | trimSuffix "-" -}}
 {{- end -}}
 
-{{- define "configmap.pdfextractorName" -}}
-{{- printf "%s-pdfextractor-configmap" .Release.Name | trunc 63 | trimSuffix "-" -}}
-{{- end -}}
-
 {{- define "configmap.adminBackendName" -}}
 {{- printf "%s-admin-backend-configmap" .Release.Name | trunc 63 | trimSuffix "-" -}}
 {{- end -}}
@@ -81,6 +77,14 @@
 {{- printf "%s:%s" .Values.extractor.image.repository .Values.extractor.image.tag | trimSuffix ":" }}
 {{- end -}}
 
+{{- define "extractor.huggingfaceCacheDir" -}}
+{{- /* Hugging Face cache directory: the configured value, or /tmp/hf-cache when unset. */ -}}{{- .Values.extractor.huggingfaceCacheDir | default "/tmp/hf-cache" -}}
+{{- end -}}
+
+{{- define "extractor.modelscopeCacheDir" -}}
+{{- /* ModelScope cache directory: the configured value, or /var/modelscope when unset. */ -}}{{- .Values.extractor.modelscopeCacheDir | default "/var/modelscope" -}}
+{{- end -}}
+
 # ingress
 {{- define "ingress.adminBackendFullname" -}}
 {{- printf "%s-admin-backend-ingress" .Release.Name | trunc 63 | trimSuffix "-" -}}

diff --git a/infrastructure/rag/templates/admin-backend/deployment.yaml b/infrastructure/rag/templates/admin-backend/deployment.yaml
@@ -36,12 +36,17 @@ spec:
           - sh
           - -c
           - |
+            set -euo pipefail;
             touch /app/services/admin-backend/log/logfile.log && \
             chmod 600 /app/services/admin-backend/log/logfile.log;
+            mkdir -p /home/nonroot/nltk_data/tokenizers && \
+            mkdir -p /home/nonroot/nltk_data/taggers && \
             wget -q -O /tmp/punkt.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt_tab.zip && \
-            unzip /tmp/punkt.zip -d /home/nonroot/nltk_data/tokenizers && \
+            unzip -oq /tmp/punkt.zip -d /home/nonroot/nltk_data/tokenizers && \
+            rm -f /tmp/punkt.zip && \
             wget -q -O /tmp/averaged_perceptron_tagger_eng.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger_eng.zip && \
-            unzip /tmp/averaged_perceptron_tagger_eng.zip -d /home/nonroot/nltk_data/taggers;
+            unzip -oq /tmp/averaged_perceptron_tagger_eng.zip -d /home/nonroot/nltk_data/taggers && \
+            rm -f /tmp/averaged_perceptron_tagger_eng.zip;
         volumeMounts:
         - name: log-dir
           mountPath: /app/services/admin-backend/log
@@ -108,8 +113,6 @@ spec:
               name: {{ template "configmap.ragapiName" . }}
           - configMapRef:
               name: {{ template "configmap.stackitVllmName" . }}
-          - configMapRef:
-              name: {{ template "configmap.pdfextractorName" . }}
           - configMapRef:
               name: {{ template "configmap.keyValueStoreName" . }}
           - configMapRef:

diff --git a/infrastructure/rag/templates/configmap.yaml b/infrastructure/rag/templates/configmap.yaml
@@ -9,15 +9,6 @@ data:
 ---
 apiVersion: v1
 kind: ConfigMap
-metadata:
-  name: {{ template "configmap.pdfextractorName" . }}
-data:
-  {{- range $key, $value := .Values.shared.envs.pdfextractor }}
-  {{ $key }}: {{ $value | quote }}
-  {{- end }}
----
-apiVersion: v1
-kind: ConfigMap
 metadata:
   name: {{ template "configmap.usecaseName" . }}
 data:

diff --git a/infrastructure/rag/templates/extractor/deployment.yaml b/infrastructure/rag/templates/extractor/deployment.yaml
@@ -25,30 +25,42 @@ spec:
           emptyDir: {}
         - name: nltk-data-dir
           emptyDir: {}
+        - name: modelscope-cache
+          emptyDir: {}
+      {{- $msCacheDir := include "extractor.modelscopeCacheDir" . }}
       {{- if .Values.shared.imagePullSecret }}
       imagePullSecrets:
       - name: {{ .Values.shared.imagePullSecret.name }}
       {{- end }}
       initContainers:
       - name: init-permissions
         image: busybox
+        securityContext:
+          runAsUser: 0
+          runAsGroup: 0
+          runAsNonRoot: false
         command:
           - sh
           - -c
           - |
             touch /app/services/document-extractor/log/logfile.log && \
-            chmod 600 /app/services/document-extractor/log/logfile.log;
+            chmod 600 /app/services/document-extractor/log/logfile.log && \
+            chown -R 10001:10001 /app/services/document-extractor/log && \
             wget -q -O /tmp/punkt.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt_tab.zip && \
-            unzip /tmp/punkt.zip -d /home/nonroot/nltk_data/tokenizers && \
+            unzip -o -q /tmp/punkt.zip -d /home/nonroot/nltk_data/tokenizers && \
             wget -q -O /tmp/averaged_perceptron_tagger_eng.zip https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger_eng.zip && \
-            unzip /tmp/averaged_perceptron_tagger_eng.zip -d /home/nonroot/nltk_data/taggers;
+            unzip -o -q /tmp/averaged_perceptron_tagger_eng.zip -d /home/nonroot/nltk_data/taggers && \
+            mkdir -p {{ include "extractor.huggingfaceCacheDir" . }} && chown -R 10001:10001 {{ include "extractor.huggingfaceCacheDir" . }} && \
+            mkdir -p {{ $msCacheDir }} && chown -R 10001:10001 {{ $msCacheDir }};
         volumeMounts:
         - name: log-dir
           mountPath: /app/services/document-extractor/log
         - name: nltk-data-dir
           mountPath: /home/nonroot/nltk_data
         - name: tmp-dir
           mountPath: /tmp
+        - name: modelscope-cache
+          mountPath: {{ $msCacheDir }}
       containers:
       - name: {{ .Values.extractor.name }}
         securityContext:
@@ -65,6 +77,8 @@ spec:
           mountPath: /tmp
         - name: nltk-data-dir
           mountPath: /home/nonroot/nltk_data
+        - name: modelscope-cache
+          mountPath: {{ $msCacheDir }}
         image: {{ template "extractor.fullImageName" . }}
         imagePullPolicy: {{ .Values.extractor.image.pullPolicy }}
         {{- if not (empty .Values.extractor.command) }}
@@ -96,12 +110,19 @@ spec:
         envFrom:
           - configMapRef:
               name: {{ template "configmap.s3Name" . }}
-          - configMapRef:
-              name: {{ template "configmap.pdfextractorName" . }}
           - secretRef:
               name: {{ template "secret.s3Name" . }}
+        {{- $hfCacheDir := include "extractor.huggingfaceCacheDir" . }}
         env:
           - name: PYTHONPATH
             value: {{ .Values.extractor.pythonPathEnv.PYTHONPATH }}
           - name: NLTK_DATA
             value: /home/nonroot/nltk_data
+          - name: HF_HOME
+            value: {{ $hfCacheDir | quote }}
+          - name: HUGGINGFACE_HUB_CACHE
+            value: {{ $hfCacheDir | quote }}
+          - name: MODELSCOPE_HOME
+            value: {{ $msCacheDir | quote }}
+          - name: XDG_CACHE_HOME
+            value: {{ $msCacheDir | quote }}
diff --git a/infrastructure/rag/values.yaml b/infrastructure/rag/values.yaml
@@ -401,6 +401,9 @@ extractor:
 
   pythonPathEnv:
     PYTHONPATH: src
+  huggingfaceCacheDir: /tmp/hf-cache
+  # Directory inside the container to use as writable cache for ModelScope / OCR models
+  modelscopeCacheDir: /var/modelscope
 
 adminFrontend:
   name: admin-frontend
@@ -464,9 +467,6 @@ shared:
 
 
   envs:
-    pdfExtractor:
-      PDF_EXTRACTOR_DIAGRAMS_FOLDER_NAME: "connection_diagrams"
-      PDF_EXTRACTOR_FOOTER_HEIGHT: 155
     s3:
       S3_ENDPOINT: http://rag-minio:9000
       S3_BUCKET: documents

diff --git a/libs/extractor-api-lib/README.md b/libs/extractor-api-lib/README.md
@@ -10,11 +10,89 @@ Content ingestion layer for the STACKIT RAG template. This library exposes a Fas
 
 ## Feature highlights
 
-- **Broad format coverage** – PDFs, DOCX, PPTX, XML/EPUB, Confluence spaces, and sitemap-driven websites.
+- **Layered extraction pipeline** – Docling, MarkItDown, and the custom extractors now cooperate with a deterministic fallback chain, so a failed run automatically cascades to the next extractor.
+- **Expanded format coverage** – PDFs, Office documents, EPUB, XML, Markdown/AsciiDoc, CSV/TXT, raster images, Confluence spaces, and sitemap-driven websites.
 - **Consistent output schema** – Information pieces are returned in a unified structure with content type (`TEXT`, `TABLE`, `IMAGE`) and metadata.
 - **Swappable extractors** – Dependency-injector container makes it easy to add or replace file/source extractors, table converters, etc.
 - **Production-grade plumbing** – Built-in S3-compatible file service, LangChain loaders with retry/backoff, optional PDF OCR, and throttling controls for web crawls.
 
+## File extractor pipeline
+
+[`GeneralFileExtractor`](src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py) orchestrates file parsing. It resolves the file type from the extension, filters the extractors that declare matching `compatible_file_types`, reverses that filtered list, and then executes the extractors in sequence until one returns content or all have failed. Exceptions are logged and the next extractor takes over; only if every extractor either returns no content or raises an exception do we bubble up an error.
+
+### Default execution order
+
+The dependency container wires extractors in the following list:
+
+1. `DoclingFileExtractor`
+2. `MarkitdownFileExtractor`
+3. `PDFExtractor`
+4. `EpubExtractor`
+5. `XMLExtractor`
+6. `MSDocsExtractor`
+7. `TesseractImageExtractor`
+
+Because the orchestrator reverses the candidate list before the fallback loop, the priority for overlapping formats is the reverse of this wiring. For example, PDFs run through Docling first, then fall back to MarkItDown, and finally to the custom PDF extractor; DOCX/PPTX files follow Docling → MarkItDown → MSDocs; raster images go through Docling’s OCR pipeline before falling back to the Tesseract-only extractor.
+
+### Supported formats
+
+| Format family            | Extensions                                               | Primary extractor          | Fallbacks                                                | Notes |
+|--------------------------|----------------------------------------------------------|----------------------------|----------------------------------------------------------|-------|
+| PDF                      | `.pdf`                                                   | Docling                    | MarkItDown → Custom PDF extractor                        | Docling performs OCR + table extraction; the PDF extractor keeps Camelot/pdfplumber heuristics as a last resort. |
+| Microsoft Word           | `.docx`                                                  | Docling                    | MarkItDown → MSDocs                                      | MSDocs keeps unstructured-based table conversion for custom cases. |
+| Microsoft PowerPoint     | `.pptx`                                                  | Docling                    | MarkItDown → MSDocs                                      | MarkItDown splits slides by `<!-- Slide number: N -->`. |
+| Microsoft Excel          | `.xlsx`                                                  | Docling                    | —                                                        | Tables returned as markdown; Docling infers sheet structure. |
+| EPUB                     | `.epub`                                                  | MarkItDown                 | EPUB extractor                                           | MarkItDown covers simple ebooks; the LangChain-based EPUB extractor preserves metadata when MarkItDown fails. |
+| HTML                     | `.html`                                                  | Docling                    | MarkItDown                                               | Docling keeps DOM-aware segmentation; MarkItDown is lighter-weight. |
+| Markdown                 | `.md`, `.markdown`, `.mdx`                               | Docling                    | —                                                        | MarkItDown does not currently register for Markdown. |
+| AsciiDoc                 | `.adoc`, `.asciidoc`                                     | Docling                    | —                                                        | |
+| CSV                      | `.csv`                                                   | Docling                    | MarkItDown                                               | Both produce markdown tables; Docling preserves structured metadata. |
+| Plain text               | `.txt`                                                   | MarkItDown                 | —                                                        | |
+| XML                      | `.xml`                                                   | XML extractor              | —                                                        | Uses the unstructured XML partitioner. |
+| Raster images            | `.jpg`, `.jpeg`, `.png`, `.tiff`, `.tif`, `.bmp`          | Docling (OCR)              | Tesseract image extractor                                | Docling feeds Tesseract CLI OCR; the fallback enforces single-frame images via Pillow. |
+
+Image coverage currently excludes animated GIF, WebP, HEIC, and SVG files. These extensions are ignored by the routing logic and will surface as “No extractor found” errors until an extractor declares support.
+
+### Source extractor pipeline
+
+`GeneralSourceExtractor` wires Confluence and sitemap loaders behind a similar abstraction. Unlike files, source extractors are keyed by `ExtractionParameters.source_type` and the matching extractor is called directly (no fallback chain).
+
+## Configuring extractor order
+
+The order lives in `DependencyContainer.file_extractors`. You can override it either by subclassing the container or by overriding the provider at runtime before wiring the FastAPI app. Example:
+
+`container.py`
+
+```python
+from dependency_injector.providers import List
+
+from extractor_api_lib.dependency_container import DependencyContainer
+
+
+class CustomExtractorContainer(DependencyContainer):
+    file_extractors = List(
+        DependencyContainer.docling_extractor,
+        DependencyContainer.markitdown_extractor,
+        DependencyContainer.ms_docs_extractor,
+        DependencyContainer.pdf_extractor,
+        DependencyContainer.image_extractor,
+        DependencyContainer.xml_extractor,
+        DependencyContainer.epub_extractor,
+    )
+```
+
+`main.py`
+
+```python
+from extractor_api_lib.main import app as perfect_extractor_app, register_dependency_container
+
+from container import CustomExtractorContainer
+
+register_dependency_container(CustomExtractorContainer())
+```
+
+The last provider in the list becomes the first extractor tried for a matching file type. Keep shared singleton providers (file service, converters) in the parent class to avoid double instantiation.
+
 ## Installation
 
 ```bash
@@ -45,11 +123,11 @@ Both endpoints stream their results back to `admin-api-lib`, which takes care of
 
 ## How the file extraction endpoint works
 
-1. Download the file from S3
-2. Chose suitable file extractor based on the filename ending
-3. Extract the content from the file
-4. Map the internal representation to the external schema
-5. Return the final output
+1. Download the file from S3.
+2. Derive the file type from the extension (normalizing common image/Markdown/AsciiDoc aliases).
+3. Select extractors that declare support for the resolved `FileType`.
+4. Run the extractors in priority order (highest priority first) and stop at the first non-empty result; if an extractor raises or returns no content, fall back to the next one.
+5. Map the internal representation to the external schema and return the final output.
 
 ## How the source extraction endpoint works
 
@@ -64,7 +142,6 @@ Both endpoints stream their results back to `admin-api-lib`, which takes care of
-Two `pydantic-settings` models ship with this package:
+One `pydantic-settings` model ships with this package:
 
 - **S3 storage** (`S3Settings`) – configure the built-in file service with `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, `S3_ENDPOINT`, and `S3_BUCKET`.
-- **PDF extraction** (`PDFExtractorSettings`) – adjust footer trimming or diagram export via `PDF_EXTRACTOR_FOOTER_HEIGHT` and `PDF_EXTRACTOR_DIAGRAMS_FOLDER_NAME`.
 
 Other extractors accept their parameters at runtime through the request payload (`ExtractionParameters`). For example, the admin backend forwards Confluence credentials, sitemap URLs, or custom headers when it calls `/extract_from_source`. This keeps the library stateless and makes it easy to plug in additional sources without redeploying.
 
@@ -80,10 +157,19 @@ from extractor_api_lib.main import app as perfect_extractor_app
 
 ## Extending the library
 
-1. Implement `InformationFileExtractor` or `InformationExtractor` for your new format/source.
-2. Register it in `dependency_container.py` (append to `file_extractors` list or `source_extractors` dict).
-3. Update mapper or metadata handling if additional fields are required.
-4. Add unit tests under `libs/extractor-api-lib/tests` using fixtures and fake storage providers.
+1. Implement `InformationFileExtractor` (for file-based inputs) or `InformationExtractor` (for remote sources).
+2. Add a provider to `DependencyContainer` (usually a `Singleton`) and wire dependencies such as the shared `FileService` or table converter.
+3. Insert the provider into `file_extractors` at the position that yields the desired fallback order (or register it with the source extractors, which are looked up by source type and have no fallback chain).
+4. Update mappers or metadata handling if additional fields are required.
+5. Cover the happy path and a failure edge case with tests under `libs/extractor-api-lib/tests`, mocking external services (OCR, network, file I/O).
+
+## Advantages and caveats
+
+- Docling-first prioritisation dramatically improves structured extraction (tables, headings) and adds OCR to formats that previously lacked it.
+- Retaining MarkItDown and the custom PDF/MS extractors provides graceful degradation when Docling fails or produces empty output.
+- Image support now goes through Docling’s OCR before falling back to pure Tesseract.
+- The configuration still requires code changes; there is no environment-variable switch to reshuffle or disable extractors at runtime.
+- Multi-frame images, animated/novel image formats, and office formats such as ODT/RTF remain unsupported.
 
 ## Contributing