Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7ea5dad
Classification: db models and migration script (#305)
nishika26 Aug 5, 2025
6aeaa09
Merge branch 'main' into feature/classification
nishika26 Aug 5, 2025
d6e8964
Classification: Fine tuning Initiation and retrieve endpoint (#315)
nishika26 Aug 8, 2025
f31cbb1
Merge branch 'main' into feature/classification
nishika26 Aug 8, 2025
afaf258
Merge branch 'main' into feature/classification
nishika26 Aug 19, 2025
bcdf759
seperate session for bg task, and formating fixes
nishika26 Aug 19, 2025
1951239
Merge branch 'main' into feature/classification
nishika26 Aug 19, 2025
2a13142
fixing alembic revision
nishika26 Aug 19, 2025
e3ee5c3
Classification : Model evaluation of fine tuned models (#326)
nishika26 Aug 19, 2025
cb21303
fixing alembic revision
nishika26 Aug 19, 2025
c67362a
Merge branch 'main' into feature/classification
nishika26 Aug 21, 2025
734178e
alembic revision fix
nishika26 Aug 24, 2025
9407df0
Classification : train and test data to s3 (#343)
nishika26 Aug 28, 2025
3b21ecc
Merge branch 'main' into feature/classification
nishika26 Aug 28, 2025
12ddac9
updating alembic revision
nishika26 Aug 28, 2025
0884b43
formatting fix
nishika26 Aug 28, 2025
bcbaf20
Classification : retaining prediction and fetching data from s3 for m…
nishika26 Aug 29, 2025
46377cd
Merge branch 'main' into feature/classification
nishika26 Sep 1, 2025
634f591
single migration file
nishika26 Sep 1, 2025
4a8d0cb
status enum columns
nishika26 Sep 3, 2025
dc27edf
Merge branch 'main' into feature/classification
nishika26 Sep 3, 2025
f9f7b4e
document seeding
nishika26 Sep 3, 2025
e9d09e3
Classification : small fixes and storage related changes (#365)
nishika26 Sep 4, 2025
68954f2
Merge branch 'main' into feature/classification
nishika26 Sep 4, 2025
e484d6c
Merge branch 'main' into feature/classification
nishika26 Sep 4, 2025
1932825
fixing alembic revision
nishika26 Sep 4, 2025
4e7d30a
uv lock
nishika26 Sep 4, 2025
9a9e709
new uv lock file
nishika26 Sep 4, 2025
04868c5
updated uv lock file
nishika26 Sep 4, 2025
58cd080
coderabbit suggestions and removing unused imports
nishika26 Sep 4, 2025
dd99bd1
changes in uv lock file
nishika26 Sep 4, 2025
bfe272a
making csv a supported file format, changing uv lock and pyproject toml
nishika26 Sep 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
"""add fine tuning and model evaluation table

Revision ID: 6ed6ed401847
Revises: 40307ab77e9f
Create Date: 2025-09-01 14:54:03.553608

"""
from alembic import op
import sqlalchemy as sa
import sqlmodel.sql.sqltypes
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "6ed6ed401847"
down_revision = "9f8a4af9d6fd"
branch_labels = None
depends_on = None


finetuning_status_enum = postgresql.ENUM(
"pending",
"running",
"completed",
"failed",
name="finetuningstatus",
create_type=False,
)

modelevaluation_status_enum = postgresql.ENUM(
"pending",
"running",
"completed",
"failed",
name="modelevaluationstatus",
create_type=False,
)


def upgrade():
    """Create the fine_tuning and model_evaluation tables.

    The two status enum types are created explicitly first; checkfirst=True
    makes re-running the migration idempotent, and the create_type=False enums
    used in the column definitions keep create_table from trying to create the
    types a second time.
    """
    finetuning_status_enum.create(op.get_bind(), checkfirst=True)
    modelevaluation_status_enum.create(op.get_bind(), checkfirst=True)
    # One row per fine-tuning job launched against an uploaded document.
    op.create_table(
        "fine_tuning",
        sa.Column("base_model", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("split_ratio", sa.Float(), nullable=False),
        sa.Column("document_id", sa.Uuid(), nullable=False),
        # File id issued by the provider after the training file upload;
        # unknown until that upload happens, hence nullable.
        sa.Column(
            "training_file_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True
        ),
        sa.Column("system_prompt", sa.Text(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        # Job id returned by the external provider; set once the job is submitted.
        sa.Column("provider_job_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True),
        sa.Column(
            "status",
            finetuning_status_enum,
            nullable=False,
            server_default="pending",
        ),
        # Name of the resulting model; populated only after the job completes.
        sa.Column(
            "fine_tuned_model", sqlmodel.sql.sqltypes.AutoString(), nullable=True
        ),
        sa.Column(
            "train_data_s3_object", sqlmodel.sql.sqltypes.AutoString(), nullable=True
        ),
        sa.Column(
            "test_data_s3_object", sqlmodel.sql.sqltypes.AutoString(), nullable=True
        ),
        sa.Column("error_message", sqlmodel.sql.sqltypes.AutoString(), nullable=True),
        sa.Column("project_id", sa.Integer(), nullable=False),
        sa.Column("organization_id", sa.Integer(), nullable=False),
        # Soft-delete bookkeeping columns.
        sa.Column("is_deleted", sa.Boolean(), nullable=False),
        sa.Column("inserted_at", sa.DateTime(), nullable=False),
        sa.Column("updated_at", sa.DateTime(), nullable=False),
        sa.Column("deleted_at", sa.DateTime(), nullable=True),
        # NOTE(review): no ondelete on document_id — deleting a document that
        # has jobs attached will raise an FK violation; confirm this is intended.
        sa.ForeignKeyConstraint(
            ["document_id"],
            ["document.id"],
        ),
        sa.ForeignKeyConstraint(
            ["organization_id"], ["organization.id"], ondelete="CASCADE"
        ),
        sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("id"),
    )
    # Evaluation results for a fine-tuned model; rows are removed automatically
    # with their parent fine_tuning row (ondelete="CASCADE" below).
    op.create_table(
        "model_evaluation",
        sa.Column("fine_tuning_id", sa.Integer(), nullable=False),
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("document_id", sa.Uuid(), nullable=False),
        sa.Column(
            "fine_tuned_model", sqlmodel.sql.sqltypes.AutoString(), nullable=False
        ),
        sa.Column(
            "test_data_s3_object", sqlmodel.sql.sqltypes.AutoString(), nullable=False
        ),
        sa.Column("base_model", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("split_ratio", sa.Float(), nullable=False),
        sa.Column("system_prompt", sa.Text(), nullable=False),
        # Evaluation metrics stored as JSON; absent until scoring has run.
        sa.Column("score", postgresql.JSON(astext_type=sa.Text()), nullable=True),
        sa.Column(
            "prediction_data_s3_object",
            sqlmodel.sql.sqltypes.AutoString(),
            nullable=True,
        ),
        sa.Column(
            "status",
            modelevaluation_status_enum,
            nullable=False,
            server_default="pending",
        ),
        sa.Column("error_message", sqlmodel.sql.sqltypes.AutoString(), nullable=True),
        sa.Column("project_id", sa.Integer(), nullable=False),
        sa.Column("organization_id", sa.Integer(), nullable=False),
        # Soft-delete bookkeeping columns (mirrors fine_tuning).
        sa.Column("is_deleted", sa.Boolean(), nullable=False),
        sa.Column("inserted_at", sa.DateTime(), nullable=False),
        sa.Column("updated_at", sa.DateTime(), nullable=False),
        sa.Column("deleted_at", sa.DateTime(), nullable=True),
        sa.ForeignKeyConstraint(
            ["document_id"],
            ["document.id"],
        ),
        sa.ForeignKeyConstraint(
            ["fine_tuning_id"], ["fine_tuning.id"], ondelete="CASCADE"
        ),
        sa.ForeignKeyConstraint(
            ["organization_id"], ["organization.id"], ondelete="CASCADE"
        ),
        sa.ForeignKeyConstraint(["project_id"], ["project.id"], ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("id"),
    )


def downgrade():
    """Drop the classification tables and their status enum types.

    Tables are dropped child-first (model_evaluation holds an FK to
    fine_tuning). The enum types were created explicitly in upgrade(), so they
    must be dropped explicitly here as well — otherwise a downgrade leaves
    orphaned finetuningstatus/modelevaluationstatus types in the database and
    a later re-creation under the same name would fail. checkfirst=True keeps
    the drop safe if a type was never created.
    """
    op.drop_table("model_evaluation")
    op.drop_table("fine_tuning")
    modelevaluation_status_enum.drop(op.get_bind(), checkfirst=True)
    finetuning_status_enum.drop(op.get_bind(), checkfirst=True)
21 changes: 21 additions & 0 deletions backend/app/api/docs/fine_tuning/create.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
This endpoint initiates the fine-tuning of an OpenAI model using your custom dataset that you would have uploaded using the upload document endpoint. The uploaded dataset must include:

- A column named `query`, `question`, or `message` containing user inputs or messages.
- A column named `label` indicating whether a given message is a genuine query or not (e.g., casual conversation or small talk).

The `split_ratio` in the request body determines how your data is divided between training and testing. For example, a split ratio of 0.5 means 50% of your data will be used for training and the remaining 50% for testing. You can also provide multiple split ratios—for instance, [0.7, 0.9]. This will trigger multiple fine-tuning jobs, one for each ratio, effectively training multiple models on different portions of your dataset. You must also specify the base model that you want to fine-tune.

The system_prompt field specified in the request body allows you to define an initial instruction or context-setting message that will be included in the training data. This message helps the model learn how it is expected to behave when responding to user inputs. It is prepended as the first message in each training example during fine-tuning.

The system handles the fine-tuning process by interacting with OpenAI's APIs under the hood. These include:

- [OpenAI File create, to upload your training and testing files](https://platform.openai.com/docs/api-reference/files/create)

- [OpenAI Fine-tuning Job create, to initiate each fine-tuning job](https://platform.openai.com/docs/api-reference/fine_tuning/create)

If successful, the response will include a message along with a list of fine-tuning jobs that were initiated. Each job object includes:

- id: the internal ID of the fine-tuning job
- document_id: the ID of the document used for fine-tuning
- split_ratio: the data split used for that job
- status: the initial status of the job (usually "pending")
5 changes: 5 additions & 0 deletions backend/app/api/docs/fine_tuning/retrieve.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Refreshes the status of a fine-tuning job by retrieving the latest information from OpenAI.
If there are any changes in status, fine-tuned model, or error message, the local job record is updated accordingly.
Returns the latest state of the job.

OpenAI’s job status is retrieved using their [Fine-tuning Job Retrieve API](https://platform.openai.com/docs/api-reference/fine_tuning/retrieve).
5 changes: 5 additions & 0 deletions backend/app/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
utils,
onboarding,
credentials,
fine_tuning,
model_evaluation,
)
from app.core.config import settings

Expand All @@ -38,6 +40,9 @@
api_router.include_router(threads.router)
api_router.include_router(users.router)
api_router.include_router(utils.router)
api_router.include_router(fine_tuning.router)
api_router.include_router(model_evaluation.router)


if settings.ENVIRONMENT in ["development", "testing"]:
api_router.include_router(private.router)
Loading