-
Notifications
You must be signed in to change notification settings - Fork 17.3k
AIP-76: Propagate partition_date to consumers of partitioned assets
#67285
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Propagate ``partition_date`` from producer DagRuns to consumers of partitioned assets, so date-shaped partitions are available in consumer task templates. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -49,6 +49,8 @@ | |
| from airflow.utils.sqlalchemy import get_dialect_name, with_row_locks | ||
|
|
||
| if TYPE_CHECKING: | ||
| from datetime import datetime | ||
|
|
||
| from sqlalchemy.orm.session import Session | ||
|
|
||
| from airflow.models.dag import DagModel | ||
|
|
@@ -274,6 +276,7 @@ def register_asset_change( | |
| source_alias_names: Collection[str] = (), | ||
| session: Session, | ||
| partition_key: str | None = None, | ||
| partition_date: datetime | None = None, | ||
| source_is_api: bool = False, | ||
| api_user_teams: set[str] | None = None, | ||
| api_allow_consumer_teams: list[str] | None = None, | ||
|
|
@@ -394,6 +397,7 @@ def register_asset_change( | |
| source_map_index=asset_event.source_map_index, | ||
| source_aliases=[aam.to_serialized() for aam in asset_alias_models], | ||
| partition_key=partition_key, | ||
| partition_date=partition_date, | ||
| ) | ||
| ) | ||
|
|
||
|
|
@@ -440,6 +444,7 @@ def register_asset_change( | |
| asset_id=asset_model.id, | ||
| dags_to_queue=dags_to_queue, | ||
| partition_key=partition_key, | ||
| partition_date=partition_date, | ||
| event=asset_event, | ||
| task_instance=task_instance, | ||
| session=session, | ||
|
|
@@ -485,6 +490,7 @@ def _queue_dagruns( | |
| asset_id: int, | ||
| dags_to_queue: set[DagModel], | ||
| partition_key: str | None, | ||
| partition_date: datetime | None, | ||
| event: AssetEvent, | ||
| task_instance: TaskInstance | None, | ||
| session: Session, | ||
|
|
@@ -499,6 +505,7 @@ def _queue_dagruns( | |
| partition_dags=partition_dags, | ||
| event=event, | ||
| partition_key=partition_key, | ||
| partition_date=partition_date, | ||
| task_instance=task_instance, | ||
| session=session, | ||
| ) | ||
|
|
@@ -527,6 +534,7 @@ def _queue_partitioned_dags( | |
| partition_dags: Iterable[DagModel], | ||
| event: AssetEvent, | ||
| partition_key: str | None, | ||
| partition_date: datetime | None, | ||
| task_instance: TaskInstance | None, | ||
| session: Session, | ||
| ) -> None: | ||
|
|
@@ -574,9 +582,9 @@ def _queue_partitioned_dags( | |
| if (asset_model := session.scalar(select(AssetModel).where(AssetModel.id == asset_id))) is None: | ||
| raise RuntimeError(f"Could not find asset for asset_id={asset_id}") | ||
|
|
||
| mapper = timetable.get_partition_mapper(name=asset_model.name, uri=asset_model.uri) | ||
| try: | ||
| # We'll need to catch every possible exception happen when mapping partition_key. | ||
| mapper = timetable.get_partition_mapper(name=asset_model.name, uri=asset_model.uri) | ||
| target_key = mapper.to_downstream(partition_key) | ||
| except Exception as err: | ||
| log.exception( | ||
|
|
@@ -643,9 +651,18 @@ def _queue_partitioned_dags( | |
| ) | ||
| continue | ||
|
|
||
| # The producer's partition_date (threaded in from its DagRun via | ||
| # register_asset_change) is carried onto the APDR only by mappers that | ||
| # opt in. IdentityMapper does, since its key carries no temporal meaning | ||
| # for the scheduler to re-derive at run creation; temporal and composite | ||
| # mappers return None here and are resolved from the key by the scheduler | ||
| # via PartitionMapper.to_partition_date. | ||
| target_partition_date: datetime | None = mapper.carry_partition_date(partition_date) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
|
||
| for target_key in target_keys: | ||
| apdr = cls._get_or_create_apdr( | ||
| target_key=target_key, | ||
| target_partition_date=target_partition_date, | ||
| target_dag=target_dag, | ||
| rollup_fingerprint=fingerprint, | ||
| asset_id=asset_id, | ||
|
|
@@ -666,6 +683,7 @@ def _get_or_create_apdr( | |
| cls, | ||
| *, | ||
| target_key: str, | ||
| target_partition_date: datetime | None, | ||
| target_dag: DagModel, | ||
| rollup_fingerprint: dict, | ||
| asset_id: int, | ||
|
|
@@ -683,6 +701,20 @@ def _get_or_create_apdr( | |
| ``rollup_fingerprint`` is the serialized mapper / window definition for all partitioned | ||
| assets in the timetable at creation time; the scheduler discards APDRs whose stamp no | ||
| longer matches the current timetable's fingerprint (mapper / window may have changed). | ||
|
|
||
| Reconciling the carried ``partition_date`` on an existing pending APDR is best-effort: | ||
| a partitioned consumer's feeding assets are expected to agree on the partition's | ||
| datetime. The carry only matters for ``IdentityMapper`` (whose key the scheduler | ||
| cannot decode); temporal/composite feeds re-derive the date from the key at run | ||
| creation regardless of what is stored here. Within that contract: | ||
|
|
||
| - If the APDR carries no date yet (``None`` — created by an event that carried none), | ||
| adopt the incoming date when this event carries one. There is nothing to conflict | ||
| with, so a later identity event's date is not dropped. | ||
| - If the APDR already carries a date and this event carries a **different** non-null | ||
| one, the producing assets disagree; picking one would be order-dependent, so the | ||
| carried date is suppressed to ``None`` (and re-adoptable by a later event). | ||
| - Otherwise (the dates agree, or this event carries none) the existing value is kept. | ||
| """ | ||
| with _lock_asset_model(session=session, asset_id=asset_id): | ||
| latest_apdr: AssetPartitionDagRun | None = session.scalar( | ||
|
|
@@ -695,6 +727,29 @@ def _get_or_create_apdr( | |
| .limit(1) | ||
| ) | ||
| if latest_apdr and latest_apdr.created_dag_run_id is None: | ||
| existing_partition_date = latest_apdr.partition_date | ||
| if existing_partition_date is None: | ||
| # No carried date yet; adopt the incoming one if present (no conflict | ||
| # to resolve). Keeps a later identity event's date from being dropped. | ||
| if target_partition_date is not None: | ||
| latest_apdr.partition_date = target_partition_date | ||
| session.flush() | ||
| elif target_partition_date is not None and existing_partition_date != target_partition_date: | ||
| # Two contributing events carry conflicting partition_dates for the same | ||
| # (target_key, target_dag). Choosing one would be order-dependent, so | ||
| # suppress: the consumer DagRun gets partition_date=None rather than a | ||
| # wrong, unstable value. | ||
| log.warning( | ||
| "Conflicting partition_date carried for the same target key; " | ||
| "suppressing it so the consumer DagRun's partition_date is None. " | ||
| "The producing assets likely disagree on the partition's datetime.", | ||
| target_dag_id=target_dag.dag_id, | ||
| target_key=target_key, | ||
| existing_partition_date=existing_partition_date, | ||
| incoming_partition_date=target_partition_date, | ||
| ) | ||
| latest_apdr.partition_date = None | ||
|
nathadfield marked this conversation as resolved.
|
||
| session.flush() | ||
| cls.logger().debug( | ||
| "Existing APDR found for key %s dag_id %s", | ||
| target_key, | ||
|
|
@@ -707,6 +762,7 @@ def _get_or_create_apdr( | |
| target_dag_id=target_dag.dag_id, | ||
| created_dag_run_id=None, | ||
| partition_key=target_key, | ||
| partition_date=target_partition_date, | ||
| rollup_fingerprint=rollup_fingerprint, | ||
| ) | ||
| session.add(apdr) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.