-
Notifications
You must be signed in to change notification settings - Fork 10
Evaluation: Added type for dataset #641
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,248 @@ | ||
| from uuid import uuid4 | ||
|
|
||
| from sqlmodel import Session, select | ||
|
|
||
| from app.core.util import now | ||
| from app.crud.evaluations.core import ( | ||
| create_evaluation_run, | ||
| get_evaluation_run_by_id, | ||
| list_evaluation_runs, | ||
| ) | ||
| from app.crud.evaluations.dataset import create_evaluation_dataset | ||
| from app.models import EvaluationRun, Organization, Project | ||
| from app.models.stt_evaluation import EvaluationType | ||
|
|
||
|
|
||
def _create_config(db: Session, project_id: int) -> tuple[int, int]:
    """Create a ``Config`` plus its first ``ConfigVersion`` for run tests.

    Args:
        db: Active database session.
        project_id: Project the config belongs to.

    Returns:
        ``(config_id, config_version_number)`` for passing to
        ``create_evaluation_run``.
    """
    # Imported locally so the module import surface stays minimal.
    from app.models.config import Config, ConfigVersion

    config = Config(
        name="test_config",
        project_id=project_id,
        inserted_at=now(),
        updated_at=now(),
    )
    db.add(config)
    db.commit()
    db.refresh(config)

    config_version = ConfigVersion(
        config_id=config.id,
        version=1,
        config_blob={"completion": {"params": {"model": "gpt-4o"}}},
        inserted_at=now(),
        updated_at=now(),
    )
    db.add(config_version)
    db.commit()
    db.refresh(config_version)

    return config.id, config_version.version
|
|
||
|
Comment on lines
+16
to
+42
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major

Adopt factory fixtures instead of inline object construction in tests. This module repeats setup logic (org/project/dataset/config/run) and uses an ad-hoc helper; please move these into reusable factory fixtures (e.g. under `backend/app/tests/factories/`) so the setup can be shared across test modules.

♻️ Example direction:

```diff
- def _create_config(db: Session, project_id: int) -> tuple:
-     ...
+ # backend/app/tests/factories/config_factory.py
+ def create_config_with_version(db: Session, project_id: int) -> tuple[int, int]:
+     ...

- config_id, config_version = _create_config(db, project.id)
+ config_id, config_version = config_factory.create_config_with_version(db, project.id)
```

As per coding guidelines. Also applies to lines 47-245.

🤖 Prompt for AI Agents
||
|
|
||
class TestCreateEvaluationRun:
    """Tests for creating evaluation runs."""

    def test_create_evaluation_run_sets_text_type(self, db: Session) -> None:
        """A freshly created evaluation run gets type TEXT and status pending."""
        organization = db.exec(select(Organization)).first()
        proj = db.exec(
            select(Project).where(Project.organization_id == organization.id)
        ).first()

        ds = create_evaluation_dataset(
            session=db,
            name="test_dataset_run_type",
            dataset_metadata={"original_items_count": 10},
            organization_id=organization.id,
            project_id=proj.id,
        )
        cfg_id, cfg_version = _create_config(db, proj.id)

        created = create_evaluation_run(
            session=db,
            run_name="test_run",
            dataset_name=ds.name,
            dataset_id=ds.id,
            config_id=cfg_id,
            config_version=cfg_version,
            organization_id=organization.id,
            project_id=proj.id,
        )

        # New runs must be persisted as TEXT type in the pending state.
        assert created.id is not None
        assert created.type == EvaluationType.TEXT.value
        assert created.status == "pending"
        assert created.run_name == "test_run"
|
|
||
|
|
||
class TestGetEvaluationRunById:
    """Tests for fetching evaluation runs by ID."""

    def test_get_evaluation_run_by_id_success(self, db: Session) -> None:
        """An existing run is returned when looked up by its ID."""
        organization = db.exec(select(Organization)).first()
        proj = db.exec(
            select(Project).where(Project.organization_id == organization.id)
        ).first()

        ds = create_evaluation_dataset(
            session=db,
            name="test_dataset_get_run",
            dataset_metadata={"original_items_count": 10},
            organization_id=organization.id,
            project_id=proj.id,
        )
        cfg_id, cfg_version = _create_config(db, proj.id)
        created = create_evaluation_run(
            session=db,
            run_name="test_get_run",
            dataset_name=ds.name,
            dataset_id=ds.id,
            config_id=cfg_id,
            config_version=cfg_version,
            organization_id=organization.id,
            project_id=proj.id,
        )

        fetched = get_evaluation_run_by_id(
            session=db,
            evaluation_id=created.id,
            organization_id=organization.id,
            project_id=proj.id,
        )

        assert fetched is not None
        assert fetched.id == created.id
        assert fetched.run_name == "test_get_run"

    def test_get_evaluation_run_by_id_not_found(self, db: Session) -> None:
        """Looking up a non-existent run ID returns None."""
        organization = db.exec(select(Organization)).first()
        proj = db.exec(
            select(Project).where(Project.organization_id == organization.id)
        ).first()

        # An ID that no run in the fresh test database can have.
        missing = get_evaluation_run_by_id(
            session=db,
            evaluation_id=99999,
            organization_id=organization.id,
            project_id=proj.id,
        )

        assert missing is None

    def test_get_evaluation_run_by_id_excludes_non_text_type(self, db: Session) -> None:
        """Runs whose type is not TEXT are invisible to the lookup."""
        organization = db.exec(select(Organization)).first()
        proj = db.exec(
            select(Project).where(Project.organization_id == organization.id)
        ).first()

        ds = create_evaluation_dataset(
            session=db,
            name="test_dataset_exclude_run",
            dataset_metadata={"original_items_count": 10},
            organization_id=organization.id,
            project_id=proj.id,
        )
        cfg_id, cfg_version = _create_config(db, proj.id)
        created = create_evaluation_run(
            session=db,
            run_name="test_stt_run",
            dataset_name=ds.name,
            dataset_id=ds.id,
            config_id=cfg_id,
            config_version=cfg_version,
            organization_id=organization.id,
            project_id=proj.id,
        )

        # Flip the type to STT by hand to simulate a non-text run.
        created.type = EvaluationType.STT.value
        db.add(created)
        db.commit()

        fetched = get_evaluation_run_by_id(
            session=db,
            evaluation_id=created.id,
            organization_id=organization.id,
            project_id=proj.id,
        )

        assert fetched is None
|
|
||
|
|
||
class TestListEvaluationRuns:
    """Tests for listing evaluation runs."""

    def test_list_evaluation_runs_empty(self, db: Session) -> None:
        """Listing with no runs present yields an empty result."""
        organization = db.exec(select(Organization)).first()
        proj = db.exec(
            select(Project).where(Project.organization_id == organization.id)
        ).first()

        runs = list_evaluation_runs(
            session=db, organization_id=organization.id, project_id=proj.id
        )

        assert len(runs) == 0

    def test_list_evaluation_runs_excludes_non_text_type(self, db: Session) -> None:
        """Only TEXT-type runs appear in the listing."""
        organization = db.exec(select(Organization)).first()
        proj = db.exec(
            select(Project).where(Project.organization_id == organization.id)
        ).first()

        ds = create_evaluation_dataset(
            session=db,
            name="test_dataset_list_runs",
            dataset_metadata={"original_items_count": 10},
            organization_id=organization.id,
            project_id=proj.id,
        )
        cfg_id, cfg_version = _create_config(db, proj.id)

        # Three TEXT runs that should all be listed.
        for idx in range(3):
            create_evaluation_run(
                session=db,
                run_name=f"text_run_{idx}",
                dataset_name=ds.name,
                dataset_id=ds.id,
                config_id=cfg_id,
                config_version=cfg_version,
                organization_id=organization.id,
                project_id=proj.id,
            )

        # One run retyped to STT after creation; it must be filtered out.
        stt_run = create_evaluation_run(
            session=db,
            run_name="stt_run",
            dataset_name=ds.name,
            dataset_id=ds.id,
            config_id=cfg_id,
            config_version=cfg_version,
            organization_id=organization.id,
            project_id=proj.id,
        )
        stt_run.type = EvaluationType.STT.value
        db.add(stt_run)
        db.commit()

        runs = list_evaluation_runs(
            session=db, organization_id=organization.id, project_id=proj.id
        )

        assert len(runs) == 3
        assert all(r.type == EvaluationType.TEXT.value for r in runs)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hard-coded `TEXT` scope makes generic dataset CRUD return false "not found" results for valid non-TEXT datasets.

These changes force dataset creation and all reads/lists to `TEXT` only. Given that `EvaluationType` includes `STT` and `TTS` (backend/app/models/stt_evaluation.py:21-26), this module now silently excludes those datasets and can surface misleading 404s in callers like `start_evaluation` (backend/app/services/evaluations/evaluation.py:28-85).

Suggested fix (parameterize the dataset type, keeping `TEXT` as the default):
def create_evaluation_dataset( session: Session, name: str, dataset_metadata: dict[str, Any], organization_id: int, project_id: int, description: str | None = None, object_store_url: str | None = None, langfuse_dataset_id: str | None = None, + evaluation_type: EvaluationType = EvaluationType.TEXT, ) -> EvaluationDataset: @@ dataset = EvaluationDataset( name=name, description=description, - type=EvaluationType.TEXT.value, + type=evaluation_type.value, dataset_metadata=dataset_metadata, @@ def get_dataset_by_id( - session: Session, dataset_id: int, organization_id: int, project_id: int + session: Session, + dataset_id: int, + organization_id: int, + project_id: int, + evaluation_type: EvaluationType = EvaluationType.TEXT, ) -> EvaluationDataset | None: @@ .where(EvaluationDataset.organization_id == organization_id) .where(EvaluationDataset.project_id == project_id) - .where(EvaluationDataset.type == EvaluationType.TEXT.value) + .where(EvaluationDataset.type == evaluation_type.value) @@ def get_dataset_by_name( - session: Session, name: str, organization_id: int, project_id: int + session: Session, + name: str, + organization_id: int, + project_id: int, + evaluation_type: EvaluationType = EvaluationType.TEXT, ) -> EvaluationDataset | None: @@ .where(EvaluationDataset.organization_id == organization_id) .where(EvaluationDataset.project_id == project_id) - .where(EvaluationDataset.type == EvaluationType.TEXT.value) + .where(EvaluationDataset.type == evaluation_type.value) @@ def list_datasets( session: Session, organization_id: int, project_id: int, limit: int = 50, offset: int = 0, + evaluation_type: EvaluationType = EvaluationType.TEXT, ) -> list[EvaluationDataset]: @@ .where(EvaluationDataset.organization_id == organization_id) .where(EvaluationDataset.project_id == project_id) - .where(EvaluationDataset.type == EvaluationType.TEXT.value) + .where(EvaluationDataset.type == evaluation_type.value)Also applies to: 127-127, 164-164, 201-201
🤖 Prompt for AI Agents