-
Notifications
You must be signed in to change notification settings - Fork 10
Evaluation: Use Config Management #477
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3f8ddcf
5280622
13eb778
7bdd322
8f9561c
f612da4
4f89f43
82bee43
a2c8a95
b9fd664
31d9523
6b00e0f
ceb3970
82c7b70
ebdda81
3faa3ab
866443c
a29bb77
f00e7e0
e772e50
92e2fea
1e01a9f
d96809e
b1bc453
4109869
4e552da
85945b2
e683ad2
9569e6a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| """add config in evals run table | ||
|
|
||
| Revision ID: 041 | ||
| Revises: 040 | ||
| Create Date: 2025-12-15 14:03:22.082746 | ||
|
|
||
| """ | ||
| from alembic import op | ||
| import sqlalchemy as sa | ||
| from sqlalchemy.dialects import postgresql | ||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision = "041" | ||
| down_revision = "040" | ||
| branch_labels = None | ||
| depends_on = None | ||
|
|
||
|
|
||
| def upgrade(): | ||
| op.add_column( | ||
| "evaluation_run", | ||
| sa.Column( | ||
| "config_id", | ||
| sa.Uuid(), | ||
| nullable=True, | ||
| comment="Reference to the stored config used", | ||
| ), | ||
| ) | ||
| op.add_column( | ||
| "evaluation_run", | ||
| sa.Column( | ||
| "config_version", | ||
| sa.Integer(), | ||
| nullable=True, | ||
| comment="Version of the config used", | ||
| ), | ||
| ) | ||
| op.create_foreign_key( | ||
| "fk_evaluation_run_config_id", "evaluation_run", "config", ["config_id"], ["id"] | ||
| ) | ||
| op.drop_column("evaluation_run", "config") | ||
|
Comment on lines
+20
to
+41
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Critical: data loss and foreign-key constraint naming issues. This migration has several critical problems: the old inline `config` column is dropped without backfilling `config_id`, so existing configuration data is lost, and the foreign-key constraint must be explicitly named so that `downgrade()` can drop it reliably.

Required actions:
🔧 Proposed fix for FK constraint naming- op.create_foreign_key(None, "evaluation_run", "config", ["config_id"], ["id"])
+ op.create_foreign_key(
+ "fk_evaluation_run_config_id",
+ "evaluation_run",
+ "config",
+ ["config_id"],
+ ["id"]
+ )And update the downgrade: - op.drop_constraint(None, "evaluation_run", type_="foreignkey")
+ op.drop_constraint("fk_evaluation_run_config_id", "evaluation_run", type_="foreignkey")
|
||
|
|
||
|
|
||
| def downgrade(): | ||
| op.add_column( | ||
| "evaluation_run", | ||
| sa.Column( | ||
| "config", | ||
| postgresql.JSONB(astext_type=sa.Text()), | ||
| autoincrement=False, | ||
| nullable=False, | ||
| comment="Evaluation configuration (model, instructions, etc.)", | ||
| ), | ||
| ) | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| op.drop_constraint( | ||
| "fk_evaluation_run_config_id", "evaluation_run", type_="foreignkey" | ||
| ) | ||
| op.drop_column("evaluation_run", "config_version") | ||
| op.drop_column("evaluation_run", "config_id") | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,7 @@ | |
|
|
||
| from app.core.batch import OpenAIBatchProvider, start_batch_job | ||
| from app.models import EvaluationRun | ||
| from app.models.llm.request import KaapiLLMParams | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
@@ -59,7 +60,7 @@ def fetch_dataset_items(langfuse: Langfuse, dataset_name: str) -> list[dict[str, | |
|
|
||
|
|
||
| def build_evaluation_jsonl( | ||
| dataset_items: list[dict[str, Any]], config: dict[str, Any] | ||
| dataset_items: list[dict[str, Any]], config: KaapiLLMParams | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To make sure Text, TTS, and STT configs all work, consider a Union of all of these related Pydantic models. Perhaps we can add a discriminated union type for this. |
||
| ) -> list[dict[str, Any]]: | ||
| """ | ||
| Build JSONL data for evaluation batch using OpenAI Responses API. | ||
|
|
@@ -88,7 +89,6 @@ def build_evaluation_jsonl( | |
| List of dictionaries (JSONL data) | ||
| """ | ||
| jsonl_data = [] | ||
|
|
||
| for item in dataset_items: | ||
| # Extract question from input | ||
| question = item["input"].get("question", "") | ||
|
|
@@ -100,14 +100,34 @@ def build_evaluation_jsonl( | |
|
|
||
| # Build the batch request object for Responses API | ||
| # Use config as-is and only add the input field | ||
| body: dict[str, Any] = { | ||
| "model": config.model, | ||
| "instructions": config.instructions, | ||
| "temperature": config.temperature | ||
| if config.temperature is not None | ||
| else 0.01, | ||
| "input": question, # Add input from dataset | ||
| } | ||
|
|
||
| # Add reasoning only if provided | ||
| if config.reasoning: | ||
| body["reasoning"] = {"effort": config.reasoning} | ||
|
|
||
| # Add tools only if knowledge_base_ids are provided | ||
| if config.knowledge_base_ids: | ||
| body["tools"] = [ | ||
| { | ||
| "type": "file_search", | ||
| "vector_store_ids": config.knowledge_base_ids, | ||
| "max_num_results": config.max_num_results or 20, | ||
| } | ||
| ] | ||
|
|
||
| batch_request = { | ||
| "custom_id": item["id"], | ||
| "method": "POST", | ||
| "url": "/v1/responses", | ||
| "body": { | ||
| **config, # Use config as-is | ||
| "input": question, # Add input from dataset | ||
| }, | ||
| "body": body, | ||
| } | ||
|
|
||
| jsonl_data.append(batch_request) | ||
|
|
@@ -119,7 +139,7 @@ def start_evaluation_batch( | |
| openai_client: OpenAI, | ||
| session: Session, | ||
| eval_run: EvaluationRun, | ||
| config: dict[str, Any], | ||
| config: KaapiLLMParams, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto as above |
||
| ) -> EvaluationRun: | ||
| """ | ||
| Fetch data, build JSONL, and start evaluation batch. | ||
|
|
@@ -132,7 +152,7 @@ def start_evaluation_batch( | |
| openai_client: Configured OpenAI client | ||
| session: Database session | ||
| eval_run: EvaluationRun database object (with run_name, dataset_name, config) | ||
| config: Evaluation configuration dict with llm, instructions, vector_store_ids | ||
| config: KaapiLLMParams with model, instructions, knowledge_base_ids, etc. | ||
|
|
||
| Returns: | ||
| Updated EvaluationRun with batch_job_id populated | ||
|
|
@@ -166,7 +186,7 @@ def start_evaluation_batch( | |
| "description": f"Evaluation: {eval_run.run_name}", | ||
| "completion_window": "24h", | ||
| # Store complete config for reference | ||
| "evaluation_config": config, | ||
| "evaluation_config": config.model_dump(exclude_none=True), | ||
| } | ||
|
|
||
| # Step 5: Start batch job using generic infrastructure | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.