Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
3f8ddcf
Refactor evaluation endpoint to use stored configuration and remove a…
avirajsingh7 Dec 9, 2025
5280622
Refactor evaluation run to use config ID and version instead of confi…
avirajsingh7 Dec 9, 2025
13eb778
Add config_id, config_version, and model fields to evaluation run table
avirajsingh7 Dec 9, 2025
7bdd322
Refactor batch evaluation tests to use config_id and config_version i…
avirajsingh7 Dec 10, 2025
8f9561c
Update EvaluationRunPublic model to allow nullable config_id and conf…
avirajsingh7 Dec 10, 2025
f612da4
Refactor evaluation run model handling: remove model field, add resol…
avirajsingh7 Dec 15, 2025
4f89f43
fix migration number
avirajsingh7 Dec 15, 2025
82bee43
fix test
avirajsingh7 Dec 15, 2025
a2c8a95
fix status code
avirajsingh7 Dec 15, 2025
b9fd664
remove old migration
avirajsingh7 Dec 15, 2025
31d9523
Merge branch 'main' into evals/config_addition
nishika26 Jan 7, 2026
6b00e0f
added depends as import
AkhileshNegi Jan 12, 2026
ceb3970
fix: spread config object while building batch eval jsonl
Prajna1999 Jan 13, 2026
82c7b70
chore: remove audio poc code
Prajna1999 Jan 14, 2026
ebdda81
fix: add comprehensive expansion of 'tools' key while building evalua…
Prajna1999 Jan 14, 2026
3faa3ab
fix: merge conflict resolution old eval
Prajna1999 Jan 20, 2026
866443c
Merge main into evals/config_addition and update to use config_id/ver…
AkhileshNegi Jan 24, 2026
a29bb77
fix: resolve merge conflicts after pull
AkhileshNegi Jan 24, 2026
f00e7e0
fixing endpoints
AkhileshNegi Jan 24, 2026
e772e50
updated testcases
AkhileshNegi Jan 25, 2026
92e2fea
defining constants
AkhileshNegi Jan 25, 2026
1e01a9f
minor cleanups
AkhileshNegi Jan 25, 2026
d96809e
cleanup config blob
AkhileshNegi Jan 25, 2026
b1bc453
coderabbit suggestions
AkhileshNegi Jan 26, 2026
4109869
cleanup migration
AkhileshNegi Jan 26, 2026
4e552da
cleanup
AkhileshNegi Jan 26, 2026
85945b2
update config setup
AkhileshNegi Jan 26, 2026
e683ad2
coderabbit suggestions
AkhileshNegi Jan 26, 2026
9569e6a
Merge branch 'main' into evals/config_addition
AkhileshNegi Jan 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions backend/app/alembic/versions/041_add_config_in_evals_run_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""add config in evals run table

Revision ID: 041
Revises: 040
Create Date: 2025-12-15 14:03:22.082746

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "041"
down_revision = "040"
branch_labels = None
depends_on = None


def upgrade():
    """Replace the inline JSONB ``config`` column on ``evaluation_run`` with a
    ``(config_id, config_version)`` reference into the ``config`` table.

    NOTE(review): the existing ``config`` JSONB payloads are dropped without
    being migrated into ``config`` records — that data is lost permanently.
    Confirm this is acceptable (e.g. no production rows rely on it) before
    deploying; otherwise a data-migration step is needed ahead of the
    ``drop_column`` below.
    """
    # Nullable FK column pointing at the stored config record.
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config_id",
            sa.Uuid(),
            nullable=True,
            comment="Reference to the stored config used",
        ),
    )
    # Pins which version of that config the run used.
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config_version",
            sa.Integer(),
            nullable=True,
            comment="Version of the config used",
        ),
    )
    # Explicitly named so downgrade() can drop it deterministically.
    op.create_foreign_key(
        "fk_evaluation_run_config_id", "evaluation_run", "config", ["config_id"], ["id"]
    )
    # Destructive: removes the old inline config blob (see NOTE above).
    op.drop_column("evaluation_run", "config")
Comment on lines +20 to +41
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Critical: Data loss and foreign key constraint naming issues.

This migration has several critical problems:

  1. Data loss: Line 41 drops the config column without migrating existing data to the new config_id/config_version columns. Any existing evaluation runs will lose their configuration data permanently.

  2. Foreign key constraint naming: Line 40 creates a foreign key with None as the constraint name, causing Alembic to auto-generate a name. However, the downgrade function (Line 57) also uses None to drop the constraint, which won't match the auto-generated name and will fail.

Required actions:

  1. Add a data migration step before dropping the config column. You'll need to:

    • Parse each existing config JSONB object
    • Look up or create corresponding config records with appropriate versions
    • Update config_id and config_version for each evaluation_run
    • Or, if data migration isn't feasible, add a comment explaining why data loss is acceptable
  2. Specify an explicit constraint name instead of None:

🔧 Proposed fix for FK constraint naming
-    op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"])
+    op.create_foreign_key(
+        "fk_evaluation_run_config_id", 
+        "evaluation_run", 
+        "config", 
+        ["config_id"], 
+        ["id"]
+    )

And update the downgrade:

-    op.drop_constraint(None, "evaluation_run", type_="foreignkey")
+    op.drop_constraint("fk_evaluation_run_config_id", "evaluation_run", type_="foreignkey")

Committable suggestion skipped: line range outside the PR's diff.



def downgrade():
    """Restore the inline JSONB ``config`` column and remove the
    ``config_id``/``config_version`` reference added by :func:`upgrade`.

    The original config payloads were destroyed by ``upgrade()``, so existing
    rows are backfilled with an empty JSONB object — without a server default,
    ``ADD COLUMN ... NOT NULL`` would fail on any non-empty table.
    """
    op.add_column(
        "evaluation_run",
        sa.Column(
            "config",
            postgresql.JSONB(astext_type=sa.Text()),
            autoincrement=False,
            nullable=False,
            # Backfill so the NOT NULL constraint is satisfiable on tables
            # that already contain evaluation runs.
            server_default=sa.text("'{}'::jsonb"),
            comment="Evaluation configuration (model, instructions, etc.)",
        ),
    )
    # Drop the transient default so future inserts must supply a config,
    # matching the column's original definition.
    op.alter_column("evaluation_run", "config", server_default=None)
    op.drop_constraint(
        "fk_evaluation_run_config_id", "evaluation_run", type_="foreignkey"
    )
    op.drop_column("evaluation_run", "config_version")
    op.drop_column("evaluation_run", "config_id")
4 changes: 2 additions & 2 deletions backend/app/api/routes/evaluations/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def _dataset_to_response(dataset: EvaluationDataset) -> DatasetUploadResponse:


@router.post(
"/",
"",
description=load_description("evaluation/upload_dataset.md"),
response_model=APIResponse[DatasetUploadResponse],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
Expand Down Expand Up @@ -87,7 +87,7 @@ async def upload_dataset(


@router.get(
"/",
"",
description=load_description("evaluation/list_datasets.md"),
response_model=APIResponse[list[DatasetUploadResponse]],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
Expand Down
16 changes: 7 additions & 9 deletions backend/app/api/routes/evaluations/evaluation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Evaluation run API routes."""

import logging
from uuid import UUID

from fastapi import (
APIRouter,
Expand Down Expand Up @@ -29,7 +30,7 @@


@router.post(
"/",
"",
description=load_description("evaluation/create_evaluation.md"),
response_model=APIResponse[EvaluationRunPublic],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
Expand All @@ -41,19 +42,16 @@ def evaluate(
experiment_name: str = Body(
..., description="Name for this evaluation experiment/run"
),
config: dict = Body(default_factory=dict, description="Evaluation configuration"),
assistant_id: str
| None = Body(
None, description="Optional assistant ID to fetch configuration from"
),
config_id: UUID = Body(..., description="Stored config ID"),
config_version: int = Body(..., ge=1, description="Stored config version"),
) -> APIResponse[EvaluationRunPublic]:
"""Start an evaluation run."""
eval_run = start_evaluation(
session=_session,
dataset_id=dataset_id,
experiment_name=experiment_name,
config=config,
assistant_id=assistant_id,
config_id=config_id,
config_version=config_version,
organization_id=auth_context.organization_.id,
project_id=auth_context.project_.id,
)
Expand All @@ -68,7 +66,7 @@ def evaluate(


@router.get(
"/",
"",
description=load_description("evaluation/list_evaluations.md"),
response_model=APIResponse[list[EvaluationRunPublic]],
dependencies=[Depends(require_permission(Permission.REQUIRE_PROJECT))],
Expand Down
44 changes: 44 additions & 0 deletions backend/app/crud/evaluations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
create_evaluation_run,
get_evaluation_run_by_id,
list_evaluation_runs,
resolve_evaluation_config,
resolve_model_from_config,
save_score,
)
from app.crud.evaluations.cron import (
Expand Down Expand Up @@ -43,3 +45,45 @@
TraceData,
TraceScore,
)

# Explicit public API of ``app.crud.evaluations`` — keeps ``import *`` and
# static analyzers in sync with the re-exports declared above.
__all__ = [
    # Core
    "create_evaluation_run",
    "get_evaluation_run_by_id",
    "list_evaluation_runs",
    "resolve_evaluation_config",
    "resolve_model_from_config",
    "save_score",
    # Cron
    "process_all_pending_evaluations",
    "process_all_pending_evaluations_sync",
    # Dataset
    "create_evaluation_dataset",
    "delete_dataset",
    "get_dataset_by_id",
    "list_datasets",
    "upload_csv_to_object_store",
    # Batch
    "start_evaluation_batch",
    # Processing
    "check_and_process_evaluation",
    "poll_all_pending_evaluations",
    "process_completed_embedding_batch",
    "process_completed_evaluation",
    # Embeddings
    "calculate_average_similarity",
    "calculate_cosine_similarity",
    "start_embedding_batch",
    # Langfuse
    "create_langfuse_dataset_run",
    "fetch_trace_scores_from_langfuse",
    "update_traces_with_cosine_scores",
    "upload_dataset_to_langfuse",
    # Score types
    "CategoricalSummaryScore",
    "EvaluationScore",
    "NumericSummaryScore",
    "SummaryScore",
    "TraceData",
    "TraceScore",
]
38 changes: 29 additions & 9 deletions backend/app/crud/evaluations/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from app.core.batch import OpenAIBatchProvider, start_batch_job
from app.models import EvaluationRun
from app.models.llm.request import KaapiLLMParams

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -59,7 +60,7 @@ def fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str,


def build_evaluation_jsonl(
dataset_items: list[dict[str, Any]], config: dict[str, Any]
dataset_items: list[dict[str, Any]], config: KaapiLLMParams
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To make sure Text, TTS and STT config works, KaapiLLMParams looks like this in the STT PR

class TextLLMParams(SQLModel):
    model: str
    instructions: str | None = Field(
        default=None,
    )
    knowledge_base_ids: list[str] | None = Field(
        default=None,
        description="List of vector store IDs to use for knowledge retrieval",
    )
    reasoning: Literal["low", "medium", "high"] | None = Field(
        default=None,
        description="Reasoning configuration or instructions",
    )
    temperature: float | None = Field(
        default=None,
        ge=0.0,
        le=2.0,
    )
    max_num_results: int | None = Field(
        default=None,
        ge=1,
        description="Maximum number of candidate results to return",
    )


class STTLLMParams(SQLModel):
    model: str
    instructions: str
    input_language: str | None = None
    output_language: str | None = None
    response_format: Literal["text"] | None = Field(
        None,
        description="Can take multiple response_format like text, json, verbose_json.",
    )
    temperature: float | None = Field(
        default=0.2,
        ge=0.0,
        le=2.0,
    )


class TTSLLMParams(SQLModel):
    model: str
    voice: str
    language: str
    response_format: Literal["mp3", "wav", "ogg"] | None = "wav"
    speed: float | None = Field(None, ge=0.25, le=4.0)


KaapiLLMParams = Union[TextLLMParams, STTLLMParams, TTSLLMParams]

Union of all these related pydantic models. May be we can add Any type to not force type safety atm and once the STT PR gets merged, plan accordingly.

) -> list[dict[str, Any]]:
"""
Build JSONL data for evaluation batch using OpenAI Responses API.
Expand Down Expand Up @@ -88,7 +89,6 @@ def build_evaluation_jsonl(
List of dictionaries (JSONL data)
"""
jsonl_data = []

for item in dataset_items:
# Extract question from input
question = item["input"].get("question", "")
Expand All @@ -100,14 +100,34 @@ def build_evaluation_jsonl(

# Build the batch request object for Responses API
# Use config as-is and only add the input field
body: dict[str, Any] = {
"model": config.model,
"instructions": config.instructions,
"temperature": config.temperature
if config.temperature is not None
else 0.01,
"input": question, # Add input from dataset
}

# Add reasoning only if provided
if config.reasoning:
body["reasoning"] = {"effort": config.reasoning}

# Add tools only if knowledge_base_ids are provided
if config.knowledge_base_ids:
body["tools"] = [
{
"type": "file_search",
"vector_store_ids": config.knowledge_base_ids,
"max_num_results": config.max_num_results or 20,
}
]

batch_request = {
"custom_id": item["id"],
"method": "POST",
"url": "/v1/responses",
"body": {
**config, # Use config as-is
"input": question, # Add input from dataset
},
"body": body,
}

jsonl_data.append(batch_request)
Expand All @@ -119,7 +139,7 @@ def start_evaluation_batch(
openai_client: OpenAI,
session: Session,
eval_run: EvaluationRun,
config: dict[str, Any],
config: KaapiLLMParams,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto as above

) -> EvaluationRun:
"""
Fetch data, build JSONL, and start evaluation batch.
Expand All @@ -132,7 +152,7 @@ def start_evaluation_batch(
openai_client: Configured OpenAI client
session: Database session
eval_run: EvaluationRun database object (with run_name, dataset_name, config)
config: Evaluation configuration dict with llm, instructions, vector_store_ids
config: KaapiLLMParams with model, instructions, knowledge_base_ids, etc.

Returns:
Updated EvaluationRun with batch_job_id populated
Expand Down Expand Up @@ -166,7 +186,7 @@ def start_evaluation_batch(
"description": f"Evaluation: {eval_run.run_name}",
"completion_window": "24h",
# Store complete config for reference
"evaluation_config": config,
"evaluation_config": config.model_dump(exclude_none=True),
}

# Step 5: Start batch job using generic infrastructure
Expand Down
Loading