Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
3f8ddcf
Refactor evaluation endpoint to use stored configuration and remove a…
avirajsingh7 Dec 9, 2025
5280622
Refactor evaluation run to use config ID and version instead of confi…
avirajsingh7 Dec 9, 2025
13eb778
Add config_id, config_version, and model fields to evaluation run table
avirajsingh7 Dec 9, 2025
7bdd322
Refactor batch evaluation tests to use config_id and config_version i…
avirajsingh7 Dec 10, 2025
8f9561c
Update EvaluationRunPublic model to allow nullable config_id and conf…
avirajsingh7 Dec 10, 2025
f612da4
Refactor evaluation run model handling: remove model field, add resol…
avirajsingh7 Dec 15, 2025
4f89f43
fix migration number
avirajsingh7 Dec 15, 2025
82bee43
fix test
avirajsingh7 Dec 15, 2025
a2c8a95
fix status code
avirajsingh7 Dec 15, 2025
b9fd664
remove old migration
avirajsingh7 Dec 15, 2025
31d9523
Merge branch 'main' into evals/config_addition
nishika26 Jan 7, 2026
6b00e0f
added depends as import
AkhileshNegi Jan 12, 2026
ceb3970
fix: spread config object while building batch eval jsonl
Prajna1999 Jan 13, 2026
82c7b70
chore: remove audio poc code
Prajna1999 Jan 14, 2026
ebdda81
fix: add comprehensive expansion of 'tools' key while building evalua…
Prajna1999 Jan 14, 2026
3faa3ab
fix: merge conflict resolution old eval
Prajna1999 Jan 20, 2026
866443c
Merge main into evals/config_addition and update to use config_id/ver…
AkhileshNegi Jan 24, 2026
a29bb77
fix: resolve merge conflicts after pull
AkhileshNegi Jan 24, 2026
f00e7e0
fixing endpoints
AkhileshNegi Jan 24, 2026
e772e50
updated testcases
AkhileshNegi Jan 25, 2026
92e2fea
defining constants
AkhileshNegi Jan 25, 2026
1e01a9f
minor cleanups
AkhileshNegi Jan 25, 2026
d96809e
cleanup config blob
AkhileshNegi Jan 25, 2026
b1bc453
coderabbit suggestions
AkhileshNegi Jan 26, 2026
4109869
cleanup migration
AkhileshNegi Jan 26, 2026
4e552da
cleanup
AkhileshNegi Jan 26, 2026
85945b2
update config setup
AkhileshNegi Jan 26, 2026
e683ad2
coderabbit suggestions
AkhileshNegi Jan 26, 2026
9569e6a
Merge branch 'main' into evals/config_addition
AkhileshNegi Jan 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions backend/app/alembic/versions/041_add_config_in_evals_run_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""add config in evals run table

Revision ID: 041
Revises: 040
Create Date: 2025-12-15 14:03:22.082746

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "041"
down_revision = "040"
branch_labels = None
depends_on = None


def upgrade():
    """Replace the inline JSONB ``config`` column on ``evaluation_run`` with a
    ``(config_id, config_version)`` reference into the ``config`` table.

    NOTE(review): the existing ``config`` JSONB payloads are dropped without
    being migrated into ``config`` records — that data is lost permanently.
    Confirm this is acceptable (e.g. no production rows rely on it) before
    deploying; otherwise a data-migration step is needed ahead of the
    ``drop_column`` below.
    """
    # Nullable FK column pointing at the stored config record.
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config_id",
            sa.Uuid(),
            nullable=True,
            comment="Reference to the stored config used",
        ),
    )
    # Pins which version of that config the run used.
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config_version",
            sa.Integer(),
            nullable=True,
            comment="Version of the config used",
        ),
    )
    # Explicitly named so downgrade() can drop it deterministically.
    op.create_foreign_key(
        "fk_evaluation_run_config_id", "evaluation_run", "config", ["config_id"], ["id"]
    )
    # Destructive: removes the old inline config blob (see NOTE above).
    op.drop_column("evaluation_run", "config")
Comment on lines +20 to +41
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Critical: Data loss and foreign key constraint naming issues.

This migration has several critical problems:

  1. Data loss: Line 41 drops the config column without migrating existing data to the new config_id/config_version columns. Any existing evaluation runs will lose their configuration data permanently.

  2. Foreign key constraint naming: Line 40 creates a foreign key with None as the constraint name, causing Alembic to auto-generate a name. However, the downgrade function (Line 57) also uses None to drop the constraint, which won't match the auto-generated name and will fail.

Required actions:

  1. Add a data migration step before dropping the config column. You'll need to:

    • Parse each existing config JSONB object
    • Look up or create corresponding config records with appropriate versions
    • Update config_id and config_version for each evaluation_run
    • Or, if data migration isn't feasible, add a comment explaining why data loss is acceptable
  2. Specify an explicit constraint name instead of None:

🔧 Proposed fix for FK constraint naming
-    op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"])
+    op.create_foreign_key(
+        "fk_evaluation_run_config_id", 
+        "evaluation_run", 
+        "config", 
+        ["config_id"], 
+        ["id"]
+    )

And update the downgrade:

-    op.drop_constraint(None, "evaluation_run", type_="foreignkey")
+    op.drop_constraint("fk_evaluation_run_config_id", "evaluation_run", type_="foreignkey")

Committable suggestion skipped: line range outside the PR's diff.



def downgrade():
    """Restore the inline JSONB ``config`` column and remove the
    ``config_id``/``config_version`` reference added by :func:`upgrade`.

    The original config payloads were destroyed by ``upgrade()``, so existing
    rows are backfilled with an empty JSONB object — without a server default,
    ``ADD COLUMN ... NOT NULL`` would fail on any non-empty table.
    """
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config",
            postgresql.JSONB(astext_type=sa.Text()),
            autoincrement=False,
            nullable=False,
            # Backfill so the NOT NULL constraint is satisfiable on tables
            # that already contain evaluation runs.
            server_default=sa.text("'{}'::jsonb"),
            comment="Evaluation configuration (model, instructions, etc.)",
        ),
    )
    # Drop the transient default so future inserts must supply a config,
    # matching the column's original definition.
    op.alter_column("evaluation_run", "config", server_default=None)
    op.drop_constraint(
        "fk_evaluation_run_config_id", "evaluation_run", type_="foreignkey"
    )
    op.drop_column("evaluation_run", "config_version")
    op.drop_column("evaluation_run", "config_id")
4 changes: 2 additions & 2 deletions backend/app/api/routes/evaluations/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def _dataset_to_response(dataset: EvaluationDataset) -> DatasetUploadResponse:


@router.post(
"/",
"",
description=load_description("evaluation/upload_dataset.md"),
response_model=APIResponse[DatasetUploadResponse],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
Expand Down Expand Up @@ -87,7 +87,7 @@ async def upload_dataset(


@router.get(
"/",
"",
description=load_description("evaluation/list_datasets.md"),
response_model=APIResponse[list[DatasetUploadResponse]],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
Expand Down
16 changes: 7 additions & 9 deletions backend/app/api/routes/evaluations/evaluation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Evaluation run API routes."""

import logging
from uuid import UUID

from fastapi import (
APIRouter,
Expand Down Expand Up @@ -29,7 +30,7 @@


@router.post(
"/",
"",
description=load_description("evaluation/create_evaluation.md"),
response_model=APIResponse[EvaluationRunPublic],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
Expand All @@ -41,19 +42,16 @@ def evaluate(
experiment_name: str = Body(
..., description="Name for this evaluation experiment/run"
),
config: dict = Body(default_factory=dict, description="Evaluation configuration"),
assistant_id: str
| None = Body(
None, description="Optional assistant ID to fetch configuration from"
),
config_id: UUID = Body(..., description="Stored config ID"),
config_version: int = Body(..., ge=1, description="Stored config version"),
) -> APIResponse[EvaluationRunPublic]:
"""Start an evaluation run."""
eval_run = start_evaluation(
session=_session,
dataset_id=dataset_id,
experiment_name=experiment_name,
config=config,
assistant_id=assistant_id,
config_id=config_id,
config_version=config_version,
organization_id=auth_context.organization_.id,
project_id=auth_context.project_.id,
)
Expand All @@ -68,7 +66,7 @@ def evaluate(


@router.get(
"/",
"",
description=load_description("evaluation/list_evaluations.md"),
response_model=APIResponse[list[EvaluationRunPublic]],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
Expand Down
44 changes: 44 additions & 0 deletions backend/app/crud/evaluations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
create_evaluation_run,
get_evaluation_run_by_id,
list_evaluation_runs,
resolve_evaluation_config,
resolve_model_from_config,
save_score,
)
from app.crud.evaluations.cron import (
Expand Down Expand Up @@ -43,3 +45,45 @@
TraceData,
TraceScore,
)

# Explicit public API of ``app.crud.evaluations`` — keeps ``import *`` and
# static analyzers in sync with the re-exports declared above.
__all__ = [
    # Core
    "create_evaluation_run",
    "get_evaluation_run_by_id",
    "list_evaluation_runs",
    "resolve_evaluation_config",
    "resolve_model_from_config",
    "save_score",
    # Cron
    "process_all_pending_evaluations",
    "process_all_pending_evaluations_sync",
    # Dataset
    "create_evaluation_dataset",
    "delete_dataset",
    "get_dataset_by_id",
    "list_datasets",
    "upload_csv_to_object_store",
    # Batch
    "start_evaluation_batch",
    # Processing
    "check_and_process_evaluation",
    "poll_all_pending_evaluations",
    "process_completed_embedding_batch",
    "process_completed_evaluation",
    # Embeddings
    "calculate_average_similarity",
    "calculate_cosine_similarity",
    "start_embedding_batch",
    # Langfuse
    "create_langfuse_dataset_run",
    "fetch_trace_scores_from_langfuse",
    "update_traces_with_cosine_scores",
    "upload_dataset_to_langfuse",
    # Score types
    "CategoricalSummaryScore",
    "EvaluationScore",
    "NumericSummaryScore",
    "SummaryScore",
    "TraceData",
    "TraceScore",
]
38 changes: 29 additions & 9 deletions backend/app/crud/evaluations/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from app.core.batch import OpenAIBatchProvider, start_batch_job
from app.models import EvaluationRun
from app.models.llm.request import KaapiLLMParams

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -59,7 +60,7 @@ def fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str,


def build_evaluation_jsonl(
dataset_items: list[dict[str, Any]], config: dict[str, Any]
dataset_items: list[dict[str, Any]], config: KaapiLLMParams
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To make sure Text, TTS and STT config works, KaapiLLMParams looks like this in the STT PR

class TextLLMParams(SQLModel):
    model: str
    instructions: str | None = Field(
        default=None,
    )
    knowledge_base_ids: list[str] | None = Field(
        default=None,
        description="List of vector store IDs to use for knowledge retrieval",
    )
    reasoning: Literal["low", "medium", "high"] | None = Field(
        default=None,
        description="Reasoning configuration or instructions",
    )
    temperature: float | None = Field(
        default=None,
        ge=0.0,
        le=2.0,
    )
    max_num_results: int | None = Field(
        default=None,
        ge=1,
        description="Maximum number of candidate results to return",
    )


class STTLLMParams(SQLModel):
    model: str
    instructions: str
    input_language: str | None = None
    output_language: str | None = None
    response_format: Literal["text"] | None = Field(
        None,
        description="Can take multiple response_format like text, json, verbose_json.",
    )
    temperature: float | None = Field(
        default=0.2,
        ge=0.0,
        le=2.0,
    )


class TTSLLMParams(SQLModel):
    model: str
    voice: str
    language: str
    response_format: Literal["mp3", "wav", "ogg"] | None = "wav"
    speed: float | None = Field(None, ge=0.25, le=4.0)


KaapiLLMParams = Union[TextLLMParams, STTLLMParams, TTSLLMParams]

Union of all these related pydantic models. May be we can add Any type to not force type safety atm and once the STT PR gets merged, plan accordingly.

) -> list[dict[str, Any]]:
"""
Build JSONL data for evaluation batch using OpenAI Responses API.
Expand Down Expand Up @@ -88,7 +89,6 @@ def build_evaluation_jsonl(
List of dictionaries (JSONL data)
"""
jsonl_data = []

for item in dataset_items:
# Extract question from input
question = item["input"].get("question", "")
Expand All @@ -100,14 +100,34 @@ def build_evaluation_jsonl(

# Build the batch request object for Responses API
# Use config as-is and only add the input field
body: dict[str, Any] = {
"model": config.model,
"instructions": config.instructions,
"temperature": config.temperature
if config.temperature is not None
else 0.01,
"input": question, # Add input from dataset
}

# Add reasoning only if provided
if config.reasoning:
body["reasoning"] = {"effort": config.reasoning}

# Add tools only if knowledge_base_ids are provided
if config.knowledge_base_ids:
body["tools"] = [
{
"type": "file_search",
"vector_store_ids": config.knowledge_base_ids,
"max_num_results": config.max_num_results or 20,
}
]

batch_request = {
"custom_id": item["id"],
"method": "POST",
"url": "/v1/responses",
"body": {
**config, # Use config as-is
"input": question, # Add input from dataset
},
"body": body,
}

jsonl_data.append(batch_request)
Expand All @@ -119,7 +139,7 @@ def start_evaluation_batch(
openai_client: OpenAI,
session: Session,
eval_run: EvaluationRun,
config: dict[str, Any],
config: KaapiLLMParams,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto as above

) -> EvaluationRun:
"""
Fetch data, build JSONL, and start evaluation batch.
Expand All @@ -132,7 +152,7 @@ def start_evaluation_batch(
openai_client: Configured OpenAI client
session: Database session
eval_run: EvaluationRun database object (with run_name, dataset_name, config)
config: Evaluation configuration dict with llm, instructions, vector_store_ids
config: KaapiLLMParams with model, instructions, knowledge_base_ids, etc.

Returns:
Updated EvaluationRun with batch_job_id populated
Expand Down Expand Up @@ -166,7 +186,7 @@ def start_evaluation_batch(
"description": f"Evaluation: {eval_run.run_name}",
"completion_window": "24h",
# Store complete config for reference
"evaluation_config": config,
"evaluation_config": config.model_dump(exclude_none=True),
}

# Step 5: Start batch job using generic infrastructure
Expand Down
Loading