From 50fc87a02379e5f49768d41209329bbc367aa890 Mon Sep 17 00:00:00 2001 From: Niko Oliveira Date: Wed, 6 May 2026 16:03:28 -0700 Subject: [PATCH 01/11] Thread version_data through BundleInfo to worker-side bundle initialization Add version_data to the push path so structured bundle metadata (e.g., S3 manifests) reaches workers at task execution time. Changes: - Add version_data field to BundleInfo (workloads/base.py) - Populate version_data from DagVersion in ExecuteTask.make() - Add selectinload(TI.dag_version) to scheduler enqueue query to avoid N+1 queries when reading version_data - Add version_data parameter to BaseDagBundle.__init__ (stored as self.version_data) and DagBundlesManager.get_bundle() - Pass version_data through task_runner.py and callback_supervisor.py - Regenerate task-sdk datamodels to include version_data in BundleInfo Existing bundles ignore version_data (defaults to None). The S3 bundle will use self.version_data in initialize() to fetch specific object versions (follow-up PR). 
--- airflow-core/src/airflow/dag_processing/bundles/base.py | 2 ++ .../src/airflow/dag_processing/bundles/manager.py | 9 +++++++-- airflow-core/src/airflow/executors/workloads/base.py | 1 + airflow-core/src/airflow/executors/workloads/task.py | 4 ++++ airflow-core/src/airflow/jobs/scheduler_job_runner.py | 1 + task-sdk/src/airflow/sdk/api/datamodels/_generated.py | 1 + .../airflow/sdk/execution_time/callback_supervisor.py | 1 + task-sdk/src/airflow/sdk/execution_time/task_runner.py | 1 + 8 files changed, 18 insertions(+), 2 deletions(-) diff --git a/airflow-core/src/airflow/dag_processing/bundles/base.py b/airflow-core/src/airflow/dag_processing/bundles/base.py index b6b55f9251cfe..2b7a5734e76c7 100644 --- a/airflow-core/src/airflow/dag_processing/bundles/base.py +++ b/airflow-core/src/airflow/dag_processing/bundles/base.py @@ -304,10 +304,12 @@ def __init__( name: str, refresh_interval: int = conf.getint("dag_processor", "refresh_interval"), version: str | None = None, + version_data: dict | None = None, view_url_template: str | None = None, ) -> None: self.name = name self.version = version + self.version_data = version_data self.refresh_interval = refresh_interval self.is_initialized: bool = False diff --git a/airflow-core/src/airflow/dag_processing/bundles/manager.py b/airflow-core/src/airflow/dag_processing/bundles/manager.py index 78c54266eda9f..08c8ee8f8ebf3 100644 --- a/airflow-core/src/airflow/dag_processing/bundles/manager.py +++ b/airflow-core/src/airflow/dag_processing/bundles/manager.py @@ -395,19 +395,24 @@ def _extract_template_params(bundle_instance: BaseDagBundle) -> dict: return params - def get_bundle(self, name: str, version: str | None = None) -> BaseDagBundle: + def get_bundle( + self, name: str, version: str | None = None, version_data: dict | None = None + ) -> BaseDagBundle: """ Get a DAG bundle by name. :param name: The name of the DAG bundle. :param version: The version of the DAG bundle you need (optional). 
If not provided, ``tracking_ref`` will be used instead. + :param version_data: Optional structured data associated with this version (e.g., S3 manifest). :return: The DAG bundle. """ cfg_bundle = self._bundle_config.get(name) if not cfg_bundle: raise ValueError(f"Requested bundle '{name}' is not configured.") - return cfg_bundle.bundle_class(name=name, version=version, **cfg_bundle.kwargs) + return cfg_bundle.bundle_class( + name=name, version=version, version_data=version_data, **cfg_bundle.kwargs + ) def get_all_dag_bundles(self) -> Iterable[BaseDagBundle]: """ diff --git a/airflow-core/src/airflow/executors/workloads/base.py b/airflow-core/src/airflow/executors/workloads/base.py index 503cab7b3965a..a7f4d0c14622a 100644 --- a/airflow-core/src/airflow/executors/workloads/base.py +++ b/airflow-core/src/airflow/executors/workloads/base.py @@ -66,6 +66,7 @@ class BundleInfo(BaseModel): name: str version: str | None = None + version_data: dict | None = None class BaseWorkloadSchema(BaseModel): diff --git a/airflow-core/src/airflow/executors/workloads/task.py b/airflow-core/src/airflow/executors/workloads/task.py index b4bf02ea47b8d..c057abb5a351e 100644 --- a/airflow-core/src/airflow/executors/workloads/task.py +++ b/airflow-core/src/airflow/executors/workloads/task.py @@ -102,9 +102,13 @@ def make( ser_ti = TaskInstanceDTO.model_validate(ti, from_attributes=True) if not bundle_info: + version_data = None + if ti.dag_version is not None: + version_data = ti.dag_version.version_data bundle_info = BundleInfo( name=ti.dag_model.bundle_name, version=ti.dag_run.bundle_version, + version_data=version_data, ) fname = log_filename_template_renderer()(ti=ti) diff --git a/airflow-core/src/airflow/jobs/scheduler_job_runner.py b/airflow-core/src/airflow/jobs/scheduler_job_runner.py index 2f94d480eb6d1..292b9652e1fae 100644 --- a/airflow-core/src/airflow/jobs/scheduler_job_runner.py +++ b/airflow-core/src/airflow/jobs/scheduler_job_runner.py @@ -684,6 +684,7 @@ def 
_executable_task_instances_to_queued(self, max_tis: int, session: Session) - ranked_query.c.map_index_for_ordering, ) .options(selectinload(TI.dag_model)) + .options(selectinload(TI.dag_version)) ) query = query.limit(max_tis) diff --git a/task-sdk/src/airflow/sdk/api/datamodels/_generated.py b/task-sdk/src/airflow/sdk/api/datamodels/_generated.py index bc03569386d14..139632fc0a037 100644 --- a/task-sdk/src/airflow/sdk/api/datamodels/_generated.py +++ b/task-sdk/src/airflow/sdk/api/datamodels/_generated.py @@ -570,6 +570,7 @@ class BundleInfo(BaseModel): name: Annotated[str, Field(title="Name")] version: Annotated[str | None, Field(title="Version")] = None + version_data: Annotated[dict[str, Any] | None, Field(title="Version Data")] = None class TerminalTIState(str, Enum): diff --git a/task-sdk/src/airflow/sdk/execution_time/callback_supervisor.py b/task-sdk/src/airflow/sdk/execution_time/callback_supervisor.py index 12f86dec36bff..98216143cf22f 100644 --- a/task-sdk/src/airflow/sdk/execution_time/callback_supervisor.py +++ b/task-sdk/src/airflow/sdk/execution_time/callback_supervisor.py @@ -227,6 +227,7 @@ def _target(): bundle = DagBundlesManager().get_bundle( name=bundle_info.name, version=bundle_info.version, + version_data=bundle_info.version_data, ) bundle.initialize() if (bundle_path := str(bundle.path)) not in sys.path: diff --git a/task-sdk/src/airflow/sdk/execution_time/task_runner.py b/task-sdk/src/airflow/sdk/execution_time/task_runner.py index f3fee689928a0..ac8889daf0286 100644 --- a/task-sdk/src/airflow/sdk/execution_time/task_runner.py +++ b/task-sdk/src/airflow/sdk/execution_time/task_runner.py @@ -1001,6 +1001,7 @@ def parse(what: StartupDetails, log: Logger) -> RuntimeTaskInstance: bundle_instance = DagBundlesManager().get_bundle( name=bundle_info.name, version=bundle_info.version, + version_data=bundle_info.version_data, ) bundle_instance.initialize() _verify_bundle_access(bundle_instance, log) From 6e6ae90122c48c448af3cc2294e6c35711b03fb6 Mon 
Sep 17 00:00:00 2001 From: Niko Oliveira Date: Tue, 19 May 2026 17:14:44 -0700 Subject: [PATCH 02/11] Fix: add dag_version mock and update edge OpenAPI spec for version_data field --- .../unit/amazon/aws/executors/ecs/test_ecs_executor.py | 1 + .../providers/edge3/worker_api/v2-edge-generated.yaml | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/providers/amazon/tests/unit/amazon/aws/executors/ecs/test_ecs_executor.py b/providers/amazon/tests/unit/amazon/aws/executors/ecs/test_ecs_executor.py index ca6fab54fa2c3..de5a17acf9666 100644 --- a/providers/amazon/tests/unit/amazon/aws/executors/ecs/test_ecs_executor.py +++ b/providers/amazon/tests/unit/amazon/aws/executors/ecs/test_ecs_executor.py @@ -1309,6 +1309,7 @@ def test_try_adopt_task_instances(self, mock_executor): task.dag_model = mock.Mock() task.dag_model.bundle_name = "test_bundle" task.dag_model.relative_fileloc = "test_dag.py" + task.dag_version = mock.Mock(version_data=None) task.dag_run = mock.Mock() task.dag_run.bundle_version = "1.0.0" task.dag_run.context_carrier = {} diff --git a/providers/edge3/src/airflow/providers/edge3/worker_api/v2-edge-generated.yaml b/providers/edge3/src/airflow/providers/edge3/worker_api/v2-edge-generated.yaml index 7a7fb194fb03d..4bd6a68af4326 100644 --- a/providers/edge3/src/airflow/providers/edge3/worker_api/v2-edge-generated.yaml +++ b/providers/edge3/src/airflow/providers/edge3/worker_api/v2-edge-generated.yaml @@ -965,6 +965,12 @@ components: - type: string - type: 'null' title: Version + version_data: + anyOf: + - additionalProperties: true + type: object + - type: 'null' + title: Version Data type: object required: - name From 718c87caa88e46816e8e030f84811147200fc04e Mon Sep 17 00:00:00 2001 From: Niko Oliveira Date: Tue, 19 May 2026 17:51:36 -0700 Subject: [PATCH 03/11] Tighten version_data type to dict[str, Any] and add unit tests Address review feedback: - Use dict[str, Any] | None instead of bare dict | None for version_data in both BaseDagBundle.__init__ 
and BundleInfo - Add minimal tests verifying version_data plumbing through the bundle constructor --- .../src/airflow/dag_processing/bundles/base.py | 2 +- .../src/airflow/executors/workloads/base.py | 4 ++-- .../tests/unit/dag_processing/bundles/test_base.py | 13 +++++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/airflow-core/src/airflow/dag_processing/bundles/base.py b/airflow-core/src/airflow/dag_processing/bundles/base.py index 2b7a5734e76c7..4fde5750b39e4 100644 --- a/airflow-core/src/airflow/dag_processing/bundles/base.py +++ b/airflow-core/src/airflow/dag_processing/bundles/base.py @@ -304,7 +304,7 @@ def __init__( name: str, refresh_interval: int = conf.getint("dag_processor", "refresh_interval"), version: str | None = None, - version_data: dict | None = None, + version_data: dict[str, Any] | None = None, view_url_template: str | None = None, ) -> None: self.name = name diff --git a/airflow-core/src/airflow/executors/workloads/base.py b/airflow-core/src/airflow/executors/workloads/base.py index a7f4d0c14622a..8a480bcf41107 100644 --- a/airflow-core/src/airflow/executors/workloads/base.py +++ b/airflow-core/src/airflow/executors/workloads/base.py @@ -21,7 +21,7 @@ import os from abc import ABC, abstractmethod from collections.abc import Hashable -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from pydantic import BaseModel, ConfigDict, Field @@ -66,7 +66,7 @@ class BundleInfo(BaseModel): name: str version: str | None = None - version_data: dict | None = None + version_data: dict[str, Any] | None = None class BaseWorkloadSchema(BaseModel): diff --git a/airflow-core/tests/unit/dag_processing/bundles/test_base.py b/airflow-core/tests/unit/dag_processing/bundles/test_base.py index 6fc7ba39a0a12..f092f3e00e770 100644 --- a/airflow-core/tests/unit/dag_processing/bundles/test_base.py +++ b/airflow-core/tests/unit/dag_processing/bundles/test_base.py @@ -323,3 +323,16 @@ def test_bundle_version_inequality(self): bv1 
= BundleVersion(version="abc", data={"key": "val"}) bv2 = BundleVersion(version="abc", data={"key": "other"}) assert bv1 != bv2 + + +def test_version_data_stored_on_bundle(): + """Test that version_data passed to a bundle constructor is stored on the instance.""" + manifest = {"schema_version": 1, "files": {"dags/my_dag.py": "S3VersionId123"}} + bundle = BasicBundle(name="test", version="abc", version_data=manifest) + assert bundle.version_data == manifest + + +def test_version_data_defaults_to_none(): + """Test that version_data defaults to None when not provided.""" + bundle = BasicBundle(name="test") + assert bundle.version_data is None From 38a2bde031558773b86174ded1d6685ac1c2a53e Mon Sep 17 00:00:00 2001 From: Niko Oliveira Date: Wed, 20 May 2026 13:59:16 -0700 Subject: [PATCH 04/11] Fix: guard CallbackKey dataclass test with AIRFLOW_V_3_3_PLUS for compat with 3.2.x The test_process_workloads_routes_execute_callback test uses CallbackKey(id=...) which requires the dataclass form introduced in 3.3. In Airflow 3.2.x, CallbackKey is a str type alias and does not accept keyword arguments. Change the skipif guard from AIRFLOW_V_3_2_PLUS to AIRFLOW_V_3_3_PLUS. 
--- .../celery/tests/unit/celery/executors/test_celery_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/providers/celery/tests/unit/celery/executors/test_celery_executor.py b/providers/celery/tests/unit/celery/executors/test_celery_executor.py index c11ea80a5baaf..6242f0ac10ce5 100644 --- a/providers/celery/tests/unit/celery/executors/test_celery_executor.py +++ b/providers/celery/tests/unit/celery/executors/test_celery_executor.py @@ -885,7 +885,7 @@ def test_celery_tasks_registered_on_import(): ) -@pytest.mark.skipif(not AIRFLOW_V_3_2_PLUS, reason="ExecuteCallback requires Airflow 3.2+") +@pytest.mark.skipif(not AIRFLOW_V_3_3_PLUS, reason="CallbackKey dataclass requires Airflow 3.3+") @pytest.mark.parametrize( ("callback_data", "expected_queue"), [ From bf949eb6f013b5fa9c8b640f9135e2ccece78f1a Mon Sep 17 00:00:00 2001 From: Niko Oliveira Date: Mon, 25 May 2026 16:30:18 -0700 Subject: [PATCH 05/11] Address review feedback: guard version_data for unpinned runs, fix serialization - Only populate version_data when dag_run.bundle_version is not None, mirroring the pinning rule for bundle_version (kaxil feedback) - Add model_serializer to BundleInfo so version_data is absent (not null) on the wire when None (ashb feedback) - Update edge OpenAPI spec: version_data is type:object, not anyOf with null - Add :param version_data: to BaseDagBundle docstring (kaxil feedback) - Remove unrelated changes from bad rebase (types.py, celery test) that were fixed separately in #66973 (ashb feedback) --- airflow-core/src/airflow/dag_processing/bundles/base.py | 2 ++ airflow-core/src/airflow/executors/workloads/base.py | 9 ++++++++- airflow-core/src/airflow/executors/workloads/task.py | 2 +- .../tests/unit/celery/executors/test_celery_executor.py | 2 +- .../providers/edge3/worker_api/v2-edge-generated.yaml | 6 ++---- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/airflow-core/src/airflow/dag_processing/bundles/base.py 
b/airflow-core/src/airflow/dag_processing/bundles/base.py index 4fde5750b39e4..344a3349fecab 100644 --- a/airflow-core/src/airflow/dag_processing/bundles/base.py +++ b/airflow-core/src/airflow/dag_processing/bundles/base.py @@ -292,6 +292,8 @@ class BaseDagBundle(ABC): :param refresh_interval: How often the bundle should be refreshed from the source in seconds (Optional - defaults to [dag_processor] refresh_interval) :param version: Version of the DAG bundle (Optional) + :param version_data: Structured metadata for this bundle version, e.g. an S3 manifest. + Only populated for pinned runs (where dag_run.bundle_version is not None). (Optional) """ supports_versioning: bool = False diff --git a/airflow-core/src/airflow/executors/workloads/base.py b/airflow-core/src/airflow/executors/workloads/base.py index 8a480bcf41107..3dd5efbdfb82d 100644 --- a/airflow-core/src/airflow/executors/workloads/base.py +++ b/airflow-core/src/airflow/executors/workloads/base.py @@ -23,7 +23,7 @@ from collections.abc import Hashable from typing import TYPE_CHECKING, Any -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, model_serializer from airflow.configuration import conf @@ -68,6 +68,13 @@ class BundleInfo(BaseModel): version: str | None = None version_data: dict[str, Any] | None = None + @model_serializer(mode="wrap") + def _serialize(self, handler: Any) -> dict[str, Any]: + data = handler(self) + if data.get("version_data") is None: + data.pop("version_data", None) + return data + class BaseWorkloadSchema(BaseModel): """Base Pydantic schema for executor workload DTOs.""" diff --git a/airflow-core/src/airflow/executors/workloads/task.py b/airflow-core/src/airflow/executors/workloads/task.py index c057abb5a351e..611457f88a957 100644 --- a/airflow-core/src/airflow/executors/workloads/task.py +++ b/airflow-core/src/airflow/executors/workloads/task.py @@ -103,7 +103,7 @@ def make( ser_ti = TaskInstanceDTO.model_validate(ti, 
from_attributes=True) if not bundle_info: version_data = None - if ti.dag_version is not None: + if ti.dag_version is not None and ti.dag_run.bundle_version is not None: version_data = ti.dag_version.version_data bundle_info = BundleInfo( name=ti.dag_model.bundle_name, diff --git a/providers/celery/tests/unit/celery/executors/test_celery_executor.py b/providers/celery/tests/unit/celery/executors/test_celery_executor.py index 6242f0ac10ce5..c11ea80a5baaf 100644 --- a/providers/celery/tests/unit/celery/executors/test_celery_executor.py +++ b/providers/celery/tests/unit/celery/executors/test_celery_executor.py @@ -885,7 +885,7 @@ def test_celery_tasks_registered_on_import(): ) -@pytest.mark.skipif(not AIRFLOW_V_3_3_PLUS, reason="CallbackKey dataclass requires Airflow 3.3+") +@pytest.mark.skipif(not AIRFLOW_V_3_2_PLUS, reason="ExecuteCallback requires Airflow 3.2+") @pytest.mark.parametrize( ("callback_data", "expected_queue"), [ diff --git a/providers/edge3/src/airflow/providers/edge3/worker_api/v2-edge-generated.yaml b/providers/edge3/src/airflow/providers/edge3/worker_api/v2-edge-generated.yaml index 4bd6a68af4326..ca55130122596 100644 --- a/providers/edge3/src/airflow/providers/edge3/worker_api/v2-edge-generated.yaml +++ b/providers/edge3/src/airflow/providers/edge3/worker_api/v2-edge-generated.yaml @@ -966,10 +966,8 @@ components: - type: 'null' title: Version version_data: - anyOf: - - additionalProperties: true - type: object - - type: 'null' + additionalProperties: true + type: object title: Version Data type: object required: From a309b4a6c8cde025f7945528938bd4322f770520 Mon Sep 17 00:00:00 2001 From: Niko Oliveira Date: Mon, 25 May 2026 17:20:41 -0700 Subject: [PATCH 06/11] Fix: remove model_serializer that broke OpenAPI schema generation, regenerate artifacts The model_serializer(mode='wrap') on BundleInfo caused Pydantic to lose JSON schema information, making the OpenAPI generator produce a generic 'additionalProperties: true, type: object' instead of the 
full BundleInfo schema with name/version/version_data fields. Removing the custom serializer restores correct schema generation. The version_data field is Optional so receivers already handle null. Also regenerates: - edge OpenAPI spec (v2-edge-generated.yaml) - supervisor schema snapshot (schema.json) - uv.lock (reflects upstream dependency changes after rebase) --- airflow-core/src/airflow/executors/workloads/base.py | 9 +-------- .../providers/edge3/worker_api/v2-edge-generated.yaml | 6 ++++-- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/airflow-core/src/airflow/executors/workloads/base.py b/airflow-core/src/airflow/executors/workloads/base.py index 3dd5efbdfb82d..8a480bcf41107 100644 --- a/airflow-core/src/airflow/executors/workloads/base.py +++ b/airflow-core/src/airflow/executors/workloads/base.py @@ -23,7 +23,7 @@ from collections.abc import Hashable from typing import TYPE_CHECKING, Any -from pydantic import BaseModel, ConfigDict, Field, model_serializer +from pydantic import BaseModel, ConfigDict, Field from airflow.configuration import conf @@ -68,13 +68,6 @@ class BundleInfo(BaseModel): version: str | None = None version_data: dict[str, Any] | None = None - @model_serializer(mode="wrap") - def _serialize(self, handler: Any) -> dict[str, Any]: - data = handler(self) - if data.get("version_data") is None: - data.pop("version_data", None) - return data - class BaseWorkloadSchema(BaseModel): """Base Pydantic schema for executor workload DTOs.""" diff --git a/providers/edge3/src/airflow/providers/edge3/worker_api/v2-edge-generated.yaml b/providers/edge3/src/airflow/providers/edge3/worker_api/v2-edge-generated.yaml index ca55130122596..4bd6a68af4326 100644 --- a/providers/edge3/src/airflow/providers/edge3/worker_api/v2-edge-generated.yaml +++ b/providers/edge3/src/airflow/providers/edge3/worker_api/v2-edge-generated.yaml @@ -966,8 +966,10 @@ components: - type: 'null' title: Version version_data: - additionalProperties: true - type: object + 
anyOf: + - additionalProperties: true + type: object + - type: 'null' title: Version Data type: object required: From 6c1d65035a3a4fc474de6934c143c88981494e53 Mon Sep 17 00:00:00 2001 From: Niko Oliveira Date: Thu, 4 Jun 2026 19:55:31 -0700 Subject: [PATCH 07/11] Address kaxil review: type consistency, docs, inline comment, and make() tests --- .../airflow/dag_processing/bundles/manager.py | 4 +- .../src/airflow/executors/workloads/base.py | 8 +++ .../src/airflow/jobs/scheduler_job_runner.py | 3 + .../tests/unit/executors/test_workloads.py | 69 +++++++++++++++++++ .../schema/versions/v2026_06_16.py | 27 ++++++++ 5 files changed, 109 insertions(+), 2 deletions(-) create mode 100644 task-sdk/src/airflow/sdk/execution_time/schema/versions/v2026_06_16.py diff --git a/airflow-core/src/airflow/dag_processing/bundles/manager.py b/airflow-core/src/airflow/dag_processing/bundles/manager.py index 08c8ee8f8ebf3..7d966e0b5074f 100644 --- a/airflow-core/src/airflow/dag_processing/bundles/manager.py +++ b/airflow-core/src/airflow/dag_processing/bundles/manager.py @@ -20,7 +20,7 @@ import logging import os import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from itsdangerous import URLSafeSerializer from pydantic import BaseModel, ValidationError @@ -396,7 +396,7 @@ def _extract_template_params(bundle_instance: BaseDagBundle) -> dict: return params def get_bundle( - self, name: str, version: str | None = None, version_data: dict | None = None + self, name: str, version: str | None = None, version_data: dict[str, Any] | None = None ) -> BaseDagBundle: """ Get a DAG bundle by name. 
diff --git a/airflow-core/src/airflow/executors/workloads/base.py b/airflow-core/src/airflow/executors/workloads/base.py index 8a480bcf41107..fc37e80b9a135 100644 --- a/airflow-core/src/airflow/executors/workloads/base.py +++ b/airflow-core/src/airflow/executors/workloads/base.py @@ -67,6 +67,14 @@ class BundleInfo(BaseModel): name: str version: str | None = None version_data: dict[str, Any] | None = None + """Optional structured metadata for this bundle version (e.g., an S3 object manifest). + + This field is serialized on every workload payload sent through executor channels + (Celery/Redis, SQS, K8s pod annotations, etc.). Keep payloads small — ideally under + 64 KB — to avoid hitting message-size limits. Bundles with large version metadata + should store a reference (e.g., a DB row ID or pre-signed URL) and fetch on the + worker side rather than inlining the full payload here. + """ class BaseWorkloadSchema(BaseModel): diff --git a/airflow-core/src/airflow/jobs/scheduler_job_runner.py b/airflow-core/src/airflow/jobs/scheduler_job_runner.py index 292b9652e1fae..37a32f1a5398e 100644 --- a/airflow-core/src/airflow/jobs/scheduler_job_runner.py +++ b/airflow-core/src/airflow/jobs/scheduler_job_runner.py @@ -684,6 +684,9 @@ def _executable_task_instances_to_queued(self, max_tis: int, session: Session) - ranked_query.c.map_index_for_ordering, ) .options(selectinload(TI.dag_model)) + # Eager-load dag_version: TIs become transient (via make_transient) before + # ExecuteTask.make() reads ti.dag_version.version_data. Lazy loads on + # transient objects silently return None instead of raising DetachedInstanceError. 
.options(selectinload(TI.dag_version)) ) diff --git a/airflow-core/tests/unit/executors/test_workloads.py b/airflow-core/tests/unit/executors/test_workloads.py index a063b91140dfb..768365276764e 100644 --- a/airflow-core/tests/unit/executors/test_workloads.py +++ b/airflow-core/tests/unit/executors/test_workloads.py @@ -171,3 +171,72 @@ def test_workload_ti_round_trips_through_sdk_generated_model(): assert received.queue == "jdk-17" assert received.map_index == 3 assert not hasattr(received, "pool_slots") + + +class TestExecuteTaskMakeVersionData: + """Tests for ExecuteTask.make() threading version_data through BundleInfo.""" + + @staticmethod + def _make_mock_ti(bundle_version, version_data): + """Build a mock TI with the attributes ExecuteTask.make() reads.""" + from unittest.mock import Mock + + ti_id = uuid4() + dag_version_id = uuid4() + + ti = Mock() + ti.id = ti_id + ti.dag_version_id = dag_version_id + ti.task_id = "test_task" + ti.dag_id = "test_dag" + ti.run_id = "test_run" + ti.try_number = 1 + ti.map_index = -1 + ti.pool_slots = 1 + ti.queue = "default" + ti.priority_weight = 1 + ti.executor_config = None + ti.parent_context_carrier = None + ti.context_carrier = None + ti.external_executor_id = None + + ti.dag_model.bundle_name = "test-bundle" + ti.dag_model.relative_fileloc = "dags/test_dag.py" + + ti.dag_run.bundle_version = bundle_version + + if version_data is not None: + ti.dag_version.version_data = version_data + else: + ti.dag_version = None + + return ti + + def test_pinned_run_populates_version_data(self, monkeypatch): + """When dag_run.bundle_version is set, version_data from dag_version flows to BundleInfo.""" + monkeypatch.setattr( + "airflow.utils.helpers.log_filename_template_renderer", + lambda: lambda **kwargs: "test.log", + ) + + version_data = {"schema_version": 1, "files": {"dags/my_dag.py": "ver123"}} + ti = self._make_mock_ti(bundle_version="abc123", version_data=version_data) + + workload = ExecuteTask.make(ti) + + assert 
workload.bundle_info.version == "abc123" + assert workload.bundle_info.version_data == version_data + + def test_unpinned_run_version_data_is_none(self, monkeypatch): + """When dag_run.bundle_version is None (unpinned), version_data must be None.""" + monkeypatch.setattr( + "airflow.utils.helpers.log_filename_template_renderer", + lambda: lambda **kwargs: "test.log", + ) + + ti = self._make_mock_ti(bundle_version=None, version_data=None) + + workload = ExecuteTask.make(ti) + + assert workload.bundle_info.version is None + assert workload.bundle_info.version_data is None diff --git a/task-sdk/src/airflow/sdk/execution_time/schema/versions/v2026_06_16.py b/task-sdk/src/airflow/sdk/execution_time/schema/versions/v2026_06_16.py new file mode 100644 index 0000000000000..011f074652a2b --- /dev/null +++ b/task-sdk/src/airflow/sdk/execution_time/schema/versions/v2026_06_16.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Supervisor schema changes for 2026-06-16. + +Changes in this version: +- Add ``version_data`` (dict[str, Any] | None, default None) to BundleInfo. 
+ No VersionChange needed: this is the first (oldest) version in the bundle + and the field has a null default, so older runtimes that omit it are + forward-compatible. +""" + +from __future__ import annotations From 4bfd3042408a6275d8d6803fc27ae6f801f224de Mon Sep 17 00:00:00 2001 From: Niko Oliveira Date: Fri, 5 Jun 2026 15:14:11 -0700 Subject: [PATCH 08/11] Simplify version_data docstring: raise soft cap to 256KB --- airflow-core/src/airflow/executors/workloads/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/airflow-core/src/airflow/executors/workloads/base.py b/airflow-core/src/airflow/executors/workloads/base.py index fc37e80b9a135..a3631244d2cec 100644 --- a/airflow-core/src/airflow/executors/workloads/base.py +++ b/airflow-core/src/airflow/executors/workloads/base.py @@ -71,9 +71,7 @@ class BundleInfo(BaseModel): This field is serialized on every workload payload sent through executor channels (Celery/Redis, SQS, K8s pod annotations, etc.). Keep payloads small — ideally under - 64 KB — to avoid hitting message-size limits. Bundles with large version metadata - should store a reference (e.g., a DB row ID or pre-signed URL) and fetch on the - worker side rather than inlining the full payload here. + 256 KB — to avoid hitting message-size limits. 
""" From d63c1e63e9e7d35315564e6d16125dbb6a7b6baf Mon Sep 17 00:00:00 2001 From: Niko Oliveira Date: Tue, 9 Jun 2026 10:38:18 -0700 Subject: [PATCH 09/11] Fix: update callback_supervisor test to expect version_data parameter --- airflow-core/tests/unit/executors/test_workloads.py | 1 + .../airflow/sdk/execution_time/schema/schema.json | 13 +++++++++++++ .../execution_time/test_callback_supervisor.py | 1 + 3 files changed, 15 insertions(+) diff --git a/airflow-core/tests/unit/executors/test_workloads.py b/airflow-core/tests/unit/executors/test_workloads.py index 768365276764e..bb4e371f90cf0 100644 --- a/airflow-core/tests/unit/executors/test_workloads.py +++ b/airflow-core/tests/unit/executors/test_workloads.py @@ -198,6 +198,7 @@ def _make_mock_ti(bundle_version, version_data): ti.executor_config = None ti.parent_context_carrier = None ti.context_carrier = None + ti.hostname = None ti.external_executor_id = None ti.dag_model.bundle_name = "test-bundle" diff --git a/task-sdk/src/airflow/sdk/execution_time/schema/schema.json b/task-sdk/src/airflow/sdk/execution_time/schema/schema.json index 087d149d1af03..ae736e5d6621e 100644 --- a/task-sdk/src/airflow/sdk/execution_time/schema/schema.json +++ b/task-sdk/src/airflow/sdk/execution_time/schema/schema.json @@ -434,6 +434,19 @@ ], "default": null, "title": "Version" + }, + "version_data": { + "anyOf": [ + { + "additionalProperties": true, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Version Data" } }, "required": [ diff --git a/task-sdk/tests/task_sdk/execution_time/test_callback_supervisor.py b/task-sdk/tests/task_sdk/execution_time/test_callback_supervisor.py index c33299b313c81..2c7a7ce0f6bb1 100644 --- a/task-sdk/tests/task_sdk/execution_time/test_callback_supervisor.py +++ b/task-sdk/tests/task_sdk/execution_time/test_callback_supervisor.py @@ -494,6 +494,7 @@ def test_execute_callback_with_bundle_info_should_pass_correct_parameters( 
mock_bundle_setup["manager"].get_bundle.assert_called_once_with( name=bundle_info.name, version=bundle_info.version, + version_data=bundle_info.version_data, ) mock_bundle_setup["bundle"].initialize.assert_called_once() From d5dc38896e35799787ece9a86ef88e3ae57ea782 Mon Sep 17 00:00:00 2001 From: Niko Oliveira Date: Tue, 23 Jun 2026 14:48:24 -0700 Subject: [PATCH 10/11] Fixes from Kaxil's latest round of reviews Scope the scheduler's dag_version eager-load with load_only(version_data) so the batched SELECT reads two columns instead of the full row. Drop the empty supervisor schema version file, which carried no VersionChange and only existed to satisfy the version-bump hook. Also declare version_data on the _BundleInfoLike Protocol and fix the BundleInfo.version_data docstring. --- .../src/airflow/executors/workloads/base.py | 6 +- .../src/airflow/jobs/scheduler_job_runner.py | 4 +- .../tests/unit/executors/test_workloads.py | 55 +++++++++++-------- .../sdk/execution_time/callback_supervisor.py | 3 +- .../schema/versions/v2026_06_16.py | 27 --------- 5 files changed, 40 insertions(+), 55 deletions(-) delete mode 100644 task-sdk/src/airflow/sdk/execution_time/schema/versions/v2026_06_16.py diff --git a/airflow-core/src/airflow/executors/workloads/base.py b/airflow-core/src/airflow/executors/workloads/base.py index a3631244d2cec..41334d68f3038 100644 --- a/airflow-core/src/airflow/executors/workloads/base.py +++ b/airflow-core/src/airflow/executors/workloads/base.py @@ -69,9 +69,9 @@ class BundleInfo(BaseModel): version_data: dict[str, Any] | None = None """Optional structured metadata for this bundle version (e.g., an S3 object manifest). - This field is serialized on every workload payload sent through executor channels - (Celery/Redis, SQS, K8s pod annotations, etc.). Keep payloads small — ideally under - 256 KB — to avoid hitting message-size limits. 
+ This field is serialized on every workload payload — executor command-line argv for + K8s/ECS/Batch/Lambda, message body for Celery/SQS. Keep payloads small to avoid hitting + transport limits (Linux caps a single argv string at ~128 KB; the etcd PodSpec ceiling is ~1.5 MB). """ diff --git a/airflow-core/src/airflow/jobs/scheduler_job_runner.py b/airflow-core/src/airflow/jobs/scheduler_job_runner.py index 37a32f1a5398e..df2abe2aabd06 100644 --- a/airflow-core/src/airflow/jobs/scheduler_job_runner.py +++ b/airflow-core/src/airflow/jobs/scheduler_job_runner.py @@ -687,7 +687,9 @@ def _executable_task_instances_to_queued(self, max_tis: int, session: Session) - # Eager-load dag_version: TIs become transient (via make_transient) before # ExecuteTask.make() reads ti.dag_version.version_data. Lazy loads on # transient objects silently return None instead of raising DetachedInstanceError. - .options(selectinload(TI.dag_version)) + # Scope the second SELECT to version_data (the PK is auto-included) so we read + # two columns rather than the full DagVersion row. 
+ .options(selectinload(TI.dag_version).load_only(DagVersion.version_data)) ) query = query.limit(max_tis) diff --git a/airflow-core/tests/unit/executors/test_workloads.py b/airflow-core/tests/unit/executors/test_workloads.py index bb4e371f90cf0..63a249a36fd1b 100644 --- a/airflow-core/tests/unit/executors/test_workloads.py +++ b/airflow-core/tests/unit/executors/test_workloads.py @@ -176,17 +176,26 @@ def test_workload_ti_round_trips_through_sdk_generated_model(): class TestExecuteTaskMakeVersionData: """Tests for ExecuteTask.make() threading version_data through BundleInfo.""" + @pytest.fixture(autouse=True) + def _stub_log_template(self, monkeypatch): + monkeypatch.setattr( + "airflow.utils.helpers.log_filename_template_renderer", + lambda: lambda **kwargs: "test.log", + ) + @staticmethod - def _make_mock_ti(bundle_version, version_data): - """Build a mock TI with the attributes ExecuteTask.make() reads.""" - from unittest.mock import Mock + def _make_mock_ti(bundle_version, version_data, *, has_dag_version=True): + """Build a mock TI with the attributes ExecuteTask.make() reads. - ti_id = uuid4() - dag_version_id = uuid4() + ``has_dag_version`` controls whether the TI has an associated DagVersion + (legacy/backfilled TIs may not), independently of ``version_data`` so the + pin-guard can be exercised with version_data present on an unpinned run. 
+ """ + from unittest.mock import Mock ti = Mock() - ti.id = ti_id - ti.dag_version_id = dag_version_id + ti.id = uuid4() + ti.dag_version_id = uuid4() ti.task_id = "test_task" ti.dag_id = "test_dag" ti.run_id = "test_run" @@ -206,20 +215,15 @@ def _make_mock_ti(bundle_version, version_data): ti.dag_run.bundle_version = bundle_version - if version_data is not None: + if has_dag_version: ti.dag_version.version_data = version_data else: ti.dag_version = None return ti - def test_pinned_run_populates_version_data(self, monkeypatch): - """When dag_run.bundle_version is set, version_data from dag_version flows to BundleInfo.""" - monkeypatch.setattr( - "airflow.utils.helpers.log_filename_template_renderer", - lambda: lambda **kwargs: "test.log", - ) - + def test_pinned_run_populates_version_data(self): + """When the run is pinned, version_data from dag_version flows to BundleInfo.""" version_data = {"schema_version": 1, "files": {"dags/my_dag.py": "ver123"}} ti = self._make_mock_ti(bundle_version="abc123", version_data=version_data) @@ -228,16 +232,21 @@ def test_pinned_run_populates_version_data(self, monkeypatch): assert workload.bundle_info.version == "abc123" assert workload.bundle_info.version_data == version_data - def test_unpinned_run_version_data_is_none(self, monkeypatch): - """When dag_run.bundle_version is None (unpinned), version_data must be None.""" - monkeypatch.setattr( - "airflow.utils.helpers.log_filename_template_renderer", - lambda: lambda **kwargs: "test.log", - ) - - ti = self._make_mock_ti(bundle_version=None, version_data=None) + def test_unpinned_run_suppresses_present_version_data(self): + """An unpinned run must not expose version_data even when the dag_version carries it.""" + version_data = {"schema_version": 1, "files": {"dags/my_dag.py": "ver123"}} + ti = self._make_mock_ti(bundle_version=None, version_data=version_data) workload = ExecuteTask.make(ti) assert workload.bundle_info.version is None assert workload.bundle_info.version_data is 
None + + def test_missing_dag_version_yields_none(self): + """A pinned run whose TI has no dag_version (legacy/backfilled) yields no version_data.""" + ti = self._make_mock_ti(bundle_version="abc123", version_data=None, has_dag_version=False) + + workload = ExecuteTask.make(ti) + + assert workload.bundle_info.version == "abc123" + assert workload.bundle_info.version_data is None diff --git a/task-sdk/src/airflow/sdk/execution_time/callback_supervisor.py b/task-sdk/src/airflow/sdk/execution_time/callback_supervisor.py index 98216143cf22f..9830771701293 100644 --- a/task-sdk/src/airflow/sdk/execution_time/callback_supervisor.py +++ b/task-sdk/src/airflow/sdk/execution_time/callback_supervisor.py @@ -25,7 +25,7 @@ from importlib import import_module from importlib.util import module_from_spec, spec_from_file_location from pathlib import Path -from typing import TYPE_CHECKING, Annotated, BinaryIO, ClassVar, Protocol +from typing import TYPE_CHECKING, Annotated, Any, BinaryIO, ClassVar, Protocol from uuid import UUID import attrs @@ -67,6 +67,7 @@ class _BundleInfoLike(Protocol): name: str version: str | None + version_data: dict[str, Any] | None __all__ = ["CallbackSubprocess", "supervise_callback"] diff --git a/task-sdk/src/airflow/sdk/execution_time/schema/versions/v2026_06_16.py b/task-sdk/src/airflow/sdk/execution_time/schema/versions/v2026_06_16.py deleted file mode 100644 index 011f074652a2b..0000000000000 --- a/task-sdk/src/airflow/sdk/execution_time/schema/versions/v2026_06_16.py +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Supervisor schema changes for 2026-06-16. - -Changes in this version: -- Add ``version_data`` (dict[str, Any] | None, default None) to BundleInfo. - No VersionChange needed: this is the first (oldest) version in the bundle - and the field has a null default, so older runtimes that omit it are - forward-compatible. -""" - -from __future__ import annotations From 703510db226cfa6c1b869f74c2011504ddbe09c6 Mon Sep 17 00:00:00 2001 From: Niko Oliveira Date: Wed, 24 Jun 2026 15:36:24 -0700 Subject: [PATCH 11/11] Set version_data on mock TI in batch executor adopt test --- .../tests/unit/amazon/aws/executors/batch/test_batch_executor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/providers/amazon/tests/unit/amazon/aws/executors/batch/test_batch_executor.py b/providers/amazon/tests/unit/amazon/aws/executors/batch/test_batch_executor.py index cf929e48d148d..211ed64acf604 100644 --- a/providers/amazon/tests/unit/amazon/aws/executors/batch/test_batch_executor.py +++ b/providers/amazon/tests/unit/amazon/aws/executors/batch/test_batch_executor.py @@ -814,6 +814,7 @@ def test_try_adopt_task_instances(self, mock_executor): task.dag_model = mock.Mock() task.dag_model.bundle_name = "test_bundle" task.dag_model.relative_fileloc = "test_dag.py" + task.dag_version = mock.Mock(version_data=None) task.dag_run = mock.Mock() task.dag_run.bundle_version = "1.0.0" task.dag_run.context_carrier = {}