Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions backend/app/crud/evaluations/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from app.crud.evaluations.score import EvaluationScore
from app.models import EvaluationRun
from app.models.llm.request import ConfigBlob, LLMCallConfig
from app.models.stt_evaluation import EvaluationType
from app.services.llm.jobs import resolve_config_blob

from app.core.db import engine
Expand Down Expand Up @@ -80,6 +81,7 @@ def create_evaluation_run(
run_name=run_name,
dataset_name=dataset_name,
dataset_id=dataset_id,
type=EvaluationType.TEXT.value,
config_id=config_id,
config_version=config_version,
status="pending",
Expand Down Expand Up @@ -129,6 +131,7 @@ def list_evaluation_runs(
select(EvaluationRun)
.where(EvaluationRun.organization_id == organization_id)
.where(EvaluationRun.project_id == project_id)
.where(EvaluationRun.type == EvaluationType.TEXT.value)
.order_by(EvaluationRun.inserted_at.desc())
.limit(limit)
.offset(offset)
Expand Down Expand Up @@ -167,6 +170,7 @@ def get_evaluation_run_by_id(
.where(EvaluationRun.id == evaluation_id)
.where(EvaluationRun.organization_id == organization_id)
.where(EvaluationRun.project_id == project_id)
.where(EvaluationRun.type == EvaluationType.TEXT.value)
)

eval_run = session.exec(statement).first()
Expand Down
5 changes: 5 additions & 0 deletions backend/app/crud/evaluations/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
)
from app.core.util import now
from app.models import EvaluationDataset, EvaluationRun
from app.models.stt_evaluation import EvaluationType

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -60,6 +61,7 @@ def create_evaluation_dataset(
dataset = EvaluationDataset(
name=name,
description=description,
type=EvaluationType.TEXT.value,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Hard-coded TEXT scope makes generic dataset CRUD return false “not found” for valid non-TEXT datasets.

These changes force dataset creation and all reads/lists to TEXT only. Given EvaluationType includes STT and TTS (backend/app/models/stt_evaluation.py:21-26), this module now silently excludes those datasets and can surface misleading 404s in callers like start_evaluation (backend/app/services/evaluations/evaluation.py:28-85).

Suggested fix (parameterize dataset type, keep TEXT default)
 def create_evaluation_dataset(
     session: Session,
     name: str,
     dataset_metadata: dict[str, Any],
     organization_id: int,
     project_id: int,
     description: str | None = None,
     object_store_url: str | None = None,
     langfuse_dataset_id: str | None = None,
+    evaluation_type: EvaluationType = EvaluationType.TEXT,
 ) -> EvaluationDataset:
@@
         dataset = EvaluationDataset(
             name=name,
             description=description,
-            type=EvaluationType.TEXT.value,
+            type=evaluation_type.value,
             dataset_metadata=dataset_metadata,
@@
 def get_dataset_by_id(
-    session: Session, dataset_id: int, organization_id: int, project_id: int
+    session: Session,
+    dataset_id: int,
+    organization_id: int,
+    project_id: int,
+    evaluation_type: EvaluationType = EvaluationType.TEXT,
 ) -> EvaluationDataset | None:
@@
         .where(EvaluationDataset.organization_id == organization_id)
         .where(EvaluationDataset.project_id == project_id)
-        .where(EvaluationDataset.type == EvaluationType.TEXT.value)
+        .where(EvaluationDataset.type == evaluation_type.value)
@@
 def get_dataset_by_name(
-    session: Session, name: str, organization_id: int, project_id: int
+    session: Session,
+    name: str,
+    organization_id: int,
+    project_id: int,
+    evaluation_type: EvaluationType = EvaluationType.TEXT,
 ) -> EvaluationDataset | None:
@@
         .where(EvaluationDataset.organization_id == organization_id)
         .where(EvaluationDataset.project_id == project_id)
-        .where(EvaluationDataset.type == EvaluationType.TEXT.value)
+        .where(EvaluationDataset.type == evaluation_type.value)
@@
 def list_datasets(
     session: Session,
     organization_id: int,
     project_id: int,
     limit: int = 50,
     offset: int = 0,
+    evaluation_type: EvaluationType = EvaluationType.TEXT,
 ) -> list[EvaluationDataset]:
@@
         .where(EvaluationDataset.organization_id == organization_id)
         .where(EvaluationDataset.project_id == project_id)
-        .where(EvaluationDataset.type == EvaluationType.TEXT.value)
+        .where(EvaluationDataset.type == evaluation_type.value)

Also applies to: 127-127, 164-164, 201-201

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/crud/evaluations/dataset.py` at line 64, The CRUD functions in
this module are hard-coding type=EvaluationType.TEXT.value causing non-TEXT
datasets to be ignored; change the dataset CRUD functions (e.g., the
create/read/list functions that currently use type=EvaluationType.TEXT.value at
the occurrences around the file) to accept an optional dataset_type parameter
(defaulting to EvaluationType.TEXT) and use dataset_type.value when
querying/creating, and update any internal calls to pass through the correct
EvaluationType (for example, have callers like start_evaluation pass the
incoming evaluation type or dataset.evaluation_type instead of relying on the
hard-coded TEXT); apply this change for each occurrence noted (around the three
lines mentioned) so STT/TTS datasets are included while preserving TEXT as the
default.

dataset_metadata=dataset_metadata,
object_store_url=object_store_url,
langfuse_dataset_id=langfuse_dataset_id,
Expand Down Expand Up @@ -122,6 +124,7 @@ def get_dataset_by_id(
.where(EvaluationDataset.id == dataset_id)
.where(EvaluationDataset.organization_id == organization_id)
.where(EvaluationDataset.project_id == project_id)
.where(EvaluationDataset.type == EvaluationType.TEXT.value)
)

dataset = session.exec(statement).first()
Expand Down Expand Up @@ -158,6 +161,7 @@ def get_dataset_by_name(
.where(EvaluationDataset.name == name)
.where(EvaluationDataset.organization_id == organization_id)
.where(EvaluationDataset.project_id == project_id)
.where(EvaluationDataset.type == EvaluationType.TEXT.value)
)

dataset = session.exec(statement).first()
Expand Down Expand Up @@ -194,6 +198,7 @@ def list_datasets(
select(EvaluationDataset)
.where(EvaluationDataset.organization_id == organization_id)
.where(EvaluationDataset.project_id == project_id)
.where(EvaluationDataset.type == EvaluationType.TEXT.value)
.order_by(EvaluationDataset.inserted_at.desc())
.limit(limit)
.offset(offset)
Expand Down
248 changes: 248 additions & 0 deletions backend/app/tests/crud/evaluations/test_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
from uuid import uuid4

from sqlmodel import Session, select

from app.core.util import now
from app.crud.evaluations.core import (
create_evaluation_run,
get_evaluation_run_by_id,
list_evaluation_runs,
)
from app.crud.evaluations.dataset import create_evaluation_dataset
from app.models import EvaluationRun, Organization, Project
from app.models.stt_evaluation import EvaluationType


def _create_config(db: Session, project_id: int) -> tuple:
    """Persist a Config with a single version; return (config_id, version)."""
    from app.models.config import Config, ConfigVersion

    cfg = Config(
        name="test_config",
        project_id=project_id,
        inserted_at=now(),
        updated_at=now(),
    )
    db.add(cfg)
    db.commit()
    db.refresh(cfg)

    cfg_version = ConfigVersion(
        config_id=cfg.id,
        version=1,
        config_blob={"completion": {"params": {"model": "gpt-4o"}}},
        inserted_at=now(),
        updated_at=now(),
    )
    db.add(cfg_version)
    db.commit()
    db.refresh(cfg_version)

    return cfg.id, cfg_version.version

Comment on lines +16 to +42
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major

Adopt factory fixtures instead of inline object construction in tests.

This module repeats setup logic (org/project/dataset/config/run) and uses an ad-hoc helper; please move these into reusable factory fixtures under backend/app/tests/ for consistency and maintainability.

♻️ Example direction
- def _create_config(db: Session, project_id: int) -> tuple:
-     ...
+ # backend/app/tests/factories/config_factory.py
+ def create_config_with_version(db: Session, project_id: int) -> tuple[int, int]:
+     ...
- config_id, config_version = _create_config(db, project.id)
+ config_id, config_version = config_factory.create_config_with_version(db, project.id)

As per coding guidelines, "backend/app/tests/**/*.py: Use factory pattern for test fixtures in backend/app/tests/."

Also applies to: 47-245

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/app/tests/crud/evaluations/test_core.py` around lines 16 - 42, The
inline helper _create_config and repeated setup logic should be replaced by
reusable factory fixtures under backend/app/tests/ (e.g.,
ConfigFactory/ConfigVersionFactory, ProjectFactory, OrgFactory, DatasetFactory,
RunFactory); create factories that construct Config and ConfigVersion (mirroring
the fields set in _create_config and using now() for timestamps), register them
as pytest fixtures, and update tests in test_core.py (and other affected tests)
to use these fixtures instead of calling _create_config or manually creating
models; ensure factories return the same identifiers/objects (config.id and
config_version.version) or provide equivalent attributes so existing assertions
in the tests remain valid.


class TestCreateEvaluationRun:
    """Tests for create_evaluation_run."""

    def test_create_evaluation_run_sets_text_type(self, db: Session) -> None:
        """A newly created run is typed TEXT and starts out pending."""
        organization = db.exec(select(Organization)).first()
        proj = db.exec(
            select(Project).where(Project.organization_id == organization.id)
        ).first()

        ds = create_evaluation_dataset(
            session=db,
            name="test_dataset_run_type",
            dataset_metadata={"original_items_count": 10},
            organization_id=organization.id,
            project_id=proj.id,
        )
        cfg_id, cfg_version = _create_config(db, proj.id)

        run = create_evaluation_run(
            session=db,
            run_name="test_run",
            dataset_name=ds.name,
            dataset_id=ds.id,
            config_id=cfg_id,
            config_version=cfg_version,
            organization_id=organization.id,
            project_id=proj.id,
        )

        assert run.id is not None
        assert run.type == EvaluationType.TEXT.value
        assert run.status == "pending"
        assert run.run_name == "test_run"


class TestGetEvaluationRunById:
    """Tests for get_evaluation_run_by_id."""

    def test_get_evaluation_run_by_id_success(self, db: Session) -> None:
        """An existing TEXT run is returned when looked up by its ID."""
        organization = db.exec(select(Organization)).first()
        proj = db.exec(
            select(Project).where(Project.organization_id == organization.id)
        ).first()

        ds = create_evaluation_dataset(
            session=db,
            name="test_dataset_get_run",
            dataset_metadata={"original_items_count": 10},
            organization_id=organization.id,
            project_id=proj.id,
        )
        cfg_id, cfg_version = _create_config(db, proj.id)

        created = create_evaluation_run(
            session=db,
            run_name="test_get_run",
            dataset_name=ds.name,
            dataset_id=ds.id,
            config_id=cfg_id,
            config_version=cfg_version,
            organization_id=organization.id,
            project_id=proj.id,
        )

        found = get_evaluation_run_by_id(
            session=db,
            evaluation_id=created.id,
            organization_id=organization.id,
            project_id=proj.id,
        )

        assert found is not None
        assert found.id == created.id
        assert found.run_name == "test_get_run"

    def test_get_evaluation_run_by_id_not_found(self, db: Session) -> None:
        """Looking up an ID that was never persisted yields None."""
        organization = db.exec(select(Organization)).first()
        proj = db.exec(
            select(Project).where(Project.organization_id == organization.id)
        ).first()

        assert (
            get_evaluation_run_by_id(
                session=db,
                evaluation_id=99999,
                organization_id=organization.id,
                project_id=proj.id,
            )
            is None
        )

    def test_get_evaluation_run_by_id_excludes_non_text_type(self, db: Session) -> None:
        """Runs whose type is not TEXT are filtered out of the lookup."""
        organization = db.exec(select(Organization)).first()
        proj = db.exec(
            select(Project).where(Project.organization_id == organization.id)
        ).first()

        ds = create_evaluation_dataset(
            session=db,
            name="test_dataset_exclude_run",
            dataset_metadata={"original_items_count": 10},
            organization_id=organization.id,
            project_id=proj.id,
        )
        cfg_id, cfg_version = _create_config(db, proj.id)

        run = create_evaluation_run(
            session=db,
            run_name="test_stt_run",
            dataset_name=ds.name,
            dataset_id=ds.id,
            config_id=cfg_id,
            config_version=cfg_version,
            organization_id=organization.id,
            project_id=proj.id,
        )

        # Flip the persisted type to STT so the TEXT-scoped query should miss it.
        run.type = EvaluationType.STT.value
        db.add(run)
        db.commit()

        assert (
            get_evaluation_run_by_id(
                session=db,
                evaluation_id=run.id,
                organization_id=organization.id,
                project_id=proj.id,
            )
            is None
        )


class TestListEvaluationRuns:
    """Tests for list_evaluation_runs."""

    def test_list_evaluation_runs_empty(self, db: Session) -> None:
        """With no runs persisted, the listing is empty."""
        organization = db.exec(select(Organization)).first()
        proj = db.exec(
            select(Project).where(Project.organization_id == organization.id)
        ).first()

        runs = list_evaluation_runs(
            session=db, organization_id=organization.id, project_id=proj.id
        )

        assert not runs

    def test_list_evaluation_runs_excludes_non_text_type(self, db: Session) -> None:
        """Only TEXT-typed runs show up in the listing."""
        organization = db.exec(select(Organization)).first()
        proj = db.exec(
            select(Project).where(Project.organization_id == organization.id)
        ).first()

        ds = create_evaluation_dataset(
            session=db,
            name="test_dataset_list_runs",
            dataset_metadata={"original_items_count": 10},
            organization_id=organization.id,
            project_id=proj.id,
        )
        cfg_id, cfg_version = _create_config(db, proj.id)

        def make_run(name: str):
            # Local shorthand: every run in this test shares the same dataset/config.
            return create_evaluation_run(
                session=db,
                run_name=name,
                dataset_name=ds.name,
                dataset_id=ds.id,
                config_id=cfg_id,
                config_version=cfg_version,
                organization_id=organization.id,
                project_id=proj.id,
            )

        for idx in range(3):
            make_run(f"text_run_{idx}")

        # Persist one run re-typed as STT; it must not appear in the listing.
        odd_one = make_run("stt_run")
        odd_one.type = EvaluationType.STT.value
        db.add(odd_one)
        db.commit()

        listed = list_evaluation_runs(
            session=db, organization_id=organization.id, project_id=proj.id
        )

        assert len(listed) == 3
        assert all(item.type == EvaluationType.TEXT.value for item in listed)
Loading