From 7c16f8afd2d23ea141453de0ecd5b22dd1e1dbab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 15:01:17 +0200 Subject: [PATCH 01/80] =?UTF-8?q?feat:=20per-period=20Icechunk=20ingest=20?= =?UTF-8?q?=E2=80=94=20protocol,=20orchestrator,=20ERA5-Land=20plugin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the plugin architecture and orchestration loop from issue #64. ### What's new **climate_api/ingest/** — new package: - protocol.py: GridSpec dataclass and IngestionPlugin Protocol - orchestrator.py: async run_ingest() loop with bounded concurrency, per-batch commits, cursor-based resume, and cancel checkpoint - store.py: Icechunk repository open/create helpers and read_committed_period_ids() for store-based resume fallback - plugins/era5_land.py: Era5LandPlugin — streams hourly ERA5-Land data from DestinE Earth Data Hub via remote zarr; handles 0-360 longitude, per-month parallel fetches, and availability lag **Dataset registry** — ingestion.plugin is now a valid alternative to ingestion.function; datasets can carry both keys during migration. **era5_land.yaml** — registers Era5LandPlugin alongside the existing function key so both paths are available. **ingestions/services.py** — branches on ingestion.plugin: routes ingest requests through run_ingest_sync and registers the Icechunk store as an ArtifactFormat.ICECHUNK artifact. **accessor.py** — adds coverage_from_open_dataset() for callers that already hold a store handle (e.g. an Icechunk session). **pyproject.toml** — adds icechunk>=2.0,<3 dependency. 14 tests covering: write all periods, fetch-exactly-once, idempotency, cursor-based resume, progress reporting, batch commit checkpoints, no-op on empty period list, cancellation, sync wrapper, load_plugin, and read_committed_period_ids. 
Closes: partially addresses #64 (Tracks 2–4 remain) --- climate_api/data/datasets/era5_land.yaml | 6 + .../data_accessor/services/accessor.py | 9 + .../data_registry/services/datasets.py | 10 +- climate_api/ingest/__init__.py | 6 + climate_api/ingest/orchestrator.py | 181 ++++++++++ climate_api/ingest/plugins/__init__.py | 1 + climate_api/ingest/plugins/era5_land.py | 156 +++++++++ climate_api/ingest/protocol.py | 74 ++++ climate_api/ingest/store.py | 50 +++ climate_api/ingestions/schemas.py | 1 + climate_api/ingestions/services.py | 99 +++++- pyproject.toml | 1 + tests/test_ingest_orchestrator.py | 326 ++++++++++++++++++ 13 files changed, 917 insertions(+), 3 deletions(-) create mode 100644 climate_api/ingest/__init__.py create mode 100644 climate_api/ingest/orchestrator.py create mode 100644 climate_api/ingest/plugins/__init__.py create mode 100644 climate_api/ingest/plugins/era5_land.py create mode 100644 climate_api/ingest/protocol.py create mode 100644 climate_api/ingest/store.py create mode 100644 tests/test_ingest_orchestrator.py diff --git a/climate_api/data/datasets/era5_land.yaml b/climate_api/data/datasets/era5_land.yaml index 14708404..e193fe61 100644 --- a/climate_api/data/datasets/era5_land.yaml +++ b/climate_api/data/datasets/era5_land.yaml @@ -16,6 +16,9 @@ begin: "1950-01-01" resolution: PT1H ingestion: + plugin: climate_api.ingest.plugins.era5_land.Era5LandPlugin + params: + variable: t2m function: dhis2eo.data.destine.era5_land.hourly.download default_params: variables: ['t2m'] @@ -47,6 +50,9 @@ begin: "1950-01-01" resolution: PT1H ingestion: + plugin: climate_api.ingest.plugins.era5_land.Era5LandPlugin + params: + variable: tp function: dhis2eo.data.destine.era5_land.hourly.download default_params: variables: ['tp'] diff --git a/climate_api/data_accessor/services/accessor.py b/climate_api/data_accessor/services/accessor.py index b3a31b92..b770791d 100644 --- a/climate_api/data_accessor/services/accessor.py +++ 
b/climate_api/data_accessor/services/accessor.py @@ -129,6 +129,15 @@ def _open_zarr(zarr_path: str) -> xr.Dataset: return xr.open_zarr(zarr_path, consolidated=None) # type: ignore[no-any-return] +def coverage_from_open_dataset(ds: xr.Dataset, *, period_type: str, native_crs: str = "EPSG:4326") -> dict[str, Any]: + """Summarize temporal and spatial coverage for a caller-managed open dataset. + + Unlike get_data_coverage_for_paths, this function does not close the dataset. + Use when the caller already holds a store handle (e.g. an Icechunk session store). + """ + return _coverage_from_dataset(ds=ds, period_type=period_type, native_crs=native_crs) + + def _coverage_from_dataset(*, ds: xr.Dataset, period_type: str, native_crs: str = "EPSG:4326") -> dict[str, Any]: """Summarize temporal and spatial coverage for an already opened dataset.""" if any(size == 0 for size in ds.sizes.values()): diff --git a/climate_api/data_registry/services/datasets.py b/climate_api/data_registry/services/datasets.py index a0487f48..acc3d51d 100644 --- a/climate_api/data_registry/services/datasets.py +++ b/climate_api/data_registry/services/datasets.py @@ -153,8 +153,14 @@ def _validate_dataset_template(dataset: object, *, source: str) -> None: if not isinstance(ingestion, dict): raise ValueError(f"Dataset template '{dataset_id}' in {source} must define an 'ingestion' block") function = ingestion.get("function") - if not isinstance(function, str) or not function: - raise ValueError(f"Dataset template '{dataset_id}' in {source} must define ingestion.function") + plugin = ingestion.get("plugin") + has_function = isinstance(function, str) and function + has_plugin = isinstance(plugin, str) and plugin + if not has_function and not has_plugin: + raise ValueError( + f"Dataset template '{dataset_id}' in {source} must define either " + "ingestion.function (legacy download path) or ingestion.plugin (per-period Icechunk ingest)" + ) sync_availability = sync_block.get("availability") if 
isinstance(sync_block, dict) else None if sync_availability is not None: diff --git a/climate_api/ingest/__init__.py b/climate_api/ingest/__init__.py new file mode 100644 index 00000000..f79c90ce --- /dev/null +++ b/climate_api/ingest/__init__.py @@ -0,0 +1,6 @@ +"""Per-period Icechunk ingest — protocol, orchestrator, and built-in plugins.""" + +from climate_api.ingest.orchestrator import run_ingest, run_ingest_sync +from climate_api.ingest.protocol import GridSpec, IngestionPlugin + +__all__ = ["GridSpec", "IngestionPlugin", "run_ingest", "run_ingest_sync"] diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py new file mode 100644 index 00000000..0c8f793a --- /dev/null +++ b/climate_api/ingest/orchestrator.py @@ -0,0 +1,181 @@ +"""Per-period Icechunk ingest orchestrator. + +The orchestrator is the only place that writes to the Icechunk store. +Plugins implement three focused async methods (probe / periods / fetch_period) +and never touch zarr directly. + +Crash recovery: each commit advances the job cursor. On restart the +orchestrator reads the cursor, skips already-committed periods, and continues +from where it stopped. A crash loses at most one uncommitted batch. +""" + +from __future__ import annotations + +import asyncio +import importlib +import logging +from collections.abc import Callable +from pathlib import Path +from typing import Any + +import xarray as xr + +from climate_api.ingest.protocol import GridSpec, IngestionPlugin +from climate_api.ingest.store import open_or_create_repo, read_committed_period_ids + +logger = logging.getLogger(__name__) + + +def load_plugin(dotted_path: str, params: dict[str, Any]) -> IngestionPlugin: + """Instantiate an IngestionPlugin from a dotted import path and YAML params. + + The class is imported from dotted_path and called with **params. Built-in + plugins accept variable and other source-specific kwargs; custom plugins + define their own __init__ signature. 
+ """ + module_path, _, class_name = dotted_path.rpartition(".") + if not module_path: + raise ValueError(f"Invalid plugin path '{dotted_path}': must be 'module.ClassName'") + module = importlib.import_module(module_path) + cls = getattr(module, class_name) + plugin = cls(**params) + if not isinstance(plugin, IngestionPlugin): + raise TypeError(f"{dotted_path} does not implement IngestionPlugin") + return plugin + + +async def run_ingest( + *, + plugin: IngestionPlugin, + params: dict[str, Any], + bbox: list[float], + start: str, + end: str, + store_path: Path, + period_type: str, + on_progress: Callable[..., None] | None = None, + is_cancel_requested: Callable[[], bool] | None = None, + save_cursor: Callable[[dict[str, Any]], None] | None = None, + load_cursor: Callable[[], dict[str, Any] | None] | None = None, +) -> None: + """Probe the source then stream per-period data into an Icechunk store. + + On the first run creates the store. On resume continues from the last + committed period recorded in the job cursor (falling back to reading the + store's committed time coordinates when no cursor is present). + + At most plugin.max_concurrency fetches run concurrently; fetched datasets + stay referenced by their tasks, so peak memory is not bounded by that limit. + Writes are sequential: tasks are awaited chronologically so time stays sorted. + """ + spec: GridSpec = await plugin.probe(bbox, **params) + logger.info("Probe: shape=%s crs=EPSG:%d time_dim=%s", spec.shape, spec.crs, spec.time_dim) + + all_periods = await plugin.periods(start, end) + if not all_periods: + logger.info("No periods available for range %s..%s", start, end) + return + + # Determine pending periods: prefer cursor (fast) then fall back to store read. 
+ cursor = load_cursor() if load_cursor else None + last_committed: str | None = cursor.get("last_committed") if cursor else None + + if last_committed and last_committed in all_periods: + idx = all_periods.index(last_committed) + 1 + pending = all_periods[idx:] + logger.info("Resuming after %s: %d/%d periods remain", last_committed, len(pending), len(all_periods)) + else: + present = read_committed_period_ids(store_path, period_type) + pending = [p for p in all_periods if p not in present] + already_done = len(all_periods) - len(pending) + logger.info("Periods: %d already committed, %d pending", already_done, len(pending)) + + if not pending: + logger.info("Store is current — nothing to ingest") + return + + done_offset = len(all_periods) - len(pending) + if on_progress: + on_progress(done=done_offset, total=len(all_periods), message=f"{len(pending)} periods pending") + + is_first_write = not store_path.exists() + repo = open_or_create_repo(store_path) + + semaphore = asyncio.Semaphore(plugin.max_concurrency) + + async def _fetch(period_id: str) -> xr.Dataset: + async with semaphore: + return await plugin.fetch_period(period_id, bbox, **params) + + # Create all tasks upfront so up to max_concurrency fetches start immediately. + # Await in chronological order so writes are always sequential. + tasks = [asyncio.create_task(_fetch(p)) for p in pending] + + session = repo.writable_session("main") + + for i, task in enumerate(tasks): + if is_cancel_requested and is_cancel_requested(): + for t in tasks[i:]: + t.cancel() + from climate_api.jobs.models import JobCancelledError + + raise JobCancelledError("Ingest cancelled between periods") + + ds = await task + period_id = pending[i] + + if not spec.time_dim: + # Static dataset: single write, no append dimension. 
+ ds.to_zarr(session.store, mode="w") + elif i == 0 and is_first_write: + ds.to_zarr(session.store, mode="w") + else: + ds.to_zarr(session.store, append_dim="time") + + should_commit = (i + 1) % plugin.commit_batch_size == 0 or (i + 1) == len(pending) + if should_commit: + session.commit(f"ingest up to {period_id}") + logger.info("Committed: up to %s (%d/%d)", period_id, i + 1, len(pending)) + if save_cursor: + save_cursor({"last_committed": period_id}) + if (i + 1) < len(pending): + # Fresh writable session for the next batch. + session = repo.writable_session("main") + + if on_progress: + on_progress(done=done_offset + i + 1, total=len(all_periods), message=f"Wrote {period_id}") + + if not spec.time_dim: + break + + +def run_ingest_sync( + *, + plugin: IngestionPlugin, + params: dict[str, Any], + bbox: list[float], + start: str, + end: str, + store_path: Path, + period_type: str, + on_progress: Callable[..., None] | None = None, + is_cancel_requested: Callable[[], bool] | None = None, + save_cursor: Callable[[dict[str, Any]], None] | None = None, + load_cursor: Callable[[], dict[str, Any] | None] | None = None, +) -> None: + """Synchronous wrapper around run_ingest for use in threaded job workers.""" + asyncio.run( + run_ingest( + plugin=plugin, + params=params, + bbox=bbox, + start=start, + end=end, + store_path=store_path, + period_type=period_type, + on_progress=on_progress, + is_cancel_requested=is_cancel_requested, + save_cursor=save_cursor, + load_cursor=load_cursor, + ) + ) diff --git a/climate_api/ingest/plugins/__init__.py b/climate_api/ingest/plugins/__init__.py new file mode 100644 index 00000000..04686344 --- /dev/null +++ b/climate_api/ingest/plugins/__init__.py @@ -0,0 +1 @@ +"""Built-in IngestionPlugin implementations.""" diff --git a/climate_api/ingest/plugins/era5_land.py b/climate_api/ingest/plugins/era5_land.py new file mode 100644 index 00000000..97af25a3 --- /dev/null +++ b/climate_api/ingest/plugins/era5_land.py @@ -0,0 +1,156 @@ 
+"""ERA5-Land IngestionPlugin — streams hourly data from DestinE Earth Data Hub. + +Authentication via .netrc (Unix) or _netrc (Windows). Register a free account +at https://earthdatahub.destine.eu/getting-started to obtain credentials. + +The DestinE ERA5-Land zarr store uses 0–360 longitudes (not −180–180). +This plugin corrects the longitude range before returning data so all stored +periods share a consistent coordinate system. +""" + +from __future__ import annotations + +import asyncio +import calendar +import logging +from concurrent.futures import ThreadPoolExecutor +from datetime import date, timedelta +from typing import Any + +import numpy as np +import xarray as xr + +from climate_api.ingest.protocol import GridSpec + +logger = logging.getLogger(__name__) + +_DESTINE_ZARR_URL = "https://data.earthdatahub.destine.eu/era5/reanalysis-era5-land-no-antartica-v0.zarr" +_STORAGE_OPTIONS = {"client_kwargs": {"trust_env": True}} + +# ERA5-Land on DestinE has roughly a 15-day publication lag. +_LAG_DAYS = 15 + +# Thread pool shared across probe/fetch calls so async methods don't block the +# event loop while waiting for remote I/O. +_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="era5land") + + +class Era5LandPlugin: + """IngestionPlugin for ERA5-Land hourly data from DestinE Earth Data Hub. + + Args: + variable: ERA5-Land variable short name (e.g. 't2m', 'tp'). 
+ """ + + max_concurrency = 4 + commit_batch_size = 720 # one month of hourly periods + + def __init__(self, variable: str) -> None: + self.variable = variable + + # ------------------------------------------------------------------ + # Protocol implementation + # ------------------------------------------------------------------ + + async def probe(self, bbox: list[float], **_: Any) -> GridSpec: + """Open the remote zarr metadata-only and return the grid spec for bbox.""" + return await asyncio.get_running_loop().run_in_executor(_executor, self._probe_sync, bbox) + + async def periods(self, start: str, end: str) -> list[str]: + """Return hourly period IDs available within the provider's lag window.""" + return self._build_periods(start, end) + + async def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: + """Fetch one hourly period from the remote zarr store.""" + return await asyncio.get_running_loop().run_in_executor( + _executor, self._fetch_sync, period_id, bbox + ) + + # ------------------------------------------------------------------ + # Sync helpers (run inside the thread pool) + # ------------------------------------------------------------------ + + def _open_remote(self) -> xr.Dataset: + return xr.open_dataset( + _DESTINE_ZARR_URL, + engine="zarr", + storage_options=_STORAGE_OPTIONS, + chunks={}, + )[[self.variable]] + + def _correct_longitude(self, ds: xr.Dataset) -> xr.Dataset: + """Unwrap 0–360 longitude to −180–180 and sort.""" + return ds.assign_coords(longitude=((ds.longitude + 180) % 360 - 180)).sortby("longitude") + + def _select_bbox(self, ds: xr.Dataset, bbox: list[float]) -> xr.Dataset: + xmin, ymin, xmax, ymax = map(float, bbox) + lon_res = float(abs(ds.longitude.diff("longitude").median())) + lat_res = float(abs(ds.latitude.diff("latitude").median())) + return ds.sel( + longitude=slice(xmin - lon_res, xmax + lon_res), + latitude=slice(ymax + lat_res, ymin - lat_res), + ) + + def _probe_sync(self, bbox: 
list[float]) -> GridSpec: + ds = self._open_remote() + ds = self._correct_longitude(ds) + ds = self._select_bbox(ds, bbox) + da = ds[self.variable] + ny = da.sizes["latitude"] + nx = da.sizes["longitude"] + return GridSpec( + shape=(ny, nx), + crs=4326, + dtype=np.dtype(da.dtype), + nodata=None, + time_dim=True, + x_dim="longitude", + y_dim="latitude", + ) + + def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: + """Fetch one hourly period: remote zarr → bbox clip → load → return.""" + hour = int(period_id[-2:]) if len(period_id) > 10 else 0 + date_part = period_id[:10] + + ds = self._open_remote() + ds = self._correct_longitude(ds) + ds = self._select_bbox(ds, bbox) + ds = ds.sel(valid_time=f"{date_part}T{hour:02d}") + + # Ensure a length-1 time dimension so append_dim="time" works correctly. + if "valid_time" in ds.dims: + ds = ds.rename({"valid_time": "time"}) + elif "valid_time" in ds.coords and "time" not in ds.dims: + ds = ds.expand_dims("time").assign_coords(time=[ds.valid_time.values]) + + ds = ds.rename({"longitude": "x", "latitude": "y"}) + return ds.load() + + # ------------------------------------------------------------------ + # Period generation + # ------------------------------------------------------------------ + + def _build_periods(self, start: str, end: str) -> list[str]: + """Generate hourly period IDs, clamped to the provider's availability lag.""" + cutoff = date.today() - timedelta(days=_LAG_DAYS) + start_dt = date.fromisoformat(start[:10]) + end_dt = min(date.fromisoformat(end[:10]), cutoff) + + periods: list[str] = [] + current = start_dt + while current <= end_dt: + _, last_day = calendar.monthrange(current.year, current.month) + for day in range(current.day if current == start_dt else 1, last_day + 1): + d = current.replace(day=day) + if d > end_dt: + break + for hour in range(24): + periods.append(f"{d.isoformat()}T{hour:02d}") + # Advance to first day of next month + if current.month == 12: + current = 
current.replace(year=current.year + 1, month=1, day=1) + else: + current = current.replace(month=current.month + 1, day=1) + + return periods diff --git a/climate_api/ingest/protocol.py b/climate_api/ingest/protocol.py new file mode 100644 index 00000000..42ecdee2 --- /dev/null +++ b/climate_api/ingest/protocol.py @@ -0,0 +1,74 @@ +"""Plugin protocol and shared data types for per-period Icechunk ingest.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable + +import numpy as np + +if TYPE_CHECKING: + import xarray as xr + + +@dataclass +class GridSpec: + """Source grid metadata returned by a plugin probe. + + The orchestrator uses this to fix the zarr chunk shape and write GeoZarr + attributes before the first period is written. Set time_dim=False for + static (time-invariant) datasets — the orchestrator branches on this flag + and issues a single write with no append dimension. + """ + + shape: tuple[int, int] + crs: int + dtype: np.dtype + nodata: float | None = None + time_dim: bool = True + x_dim: str = "x" + y_dim: str = "y" + attrs: dict[str, Any] = field(default_factory=dict) + + +@runtime_checkable +class IngestionPlugin(Protocol): + """Minimal interface a plugin must implement for per-period Icechunk ingest. + + The climate-api layer owns the orchestration loop — plugins never touch + zarr or Icechunk directly. Implement the three async methods and declare + max_concurrency and commit_batch_size as class attributes. + + max_concurrency: maximum number of fetch_period calls in flight at once. + Keep at 1 for sources with large per-period files or rate-limited APIs. + Raise for sources where individual periods are small (< 50 MB). + + commit_batch_size: number of periods written between Icechunk commits. + Use 1 for monthly sources. For daily sources use ~30; for hourly ~720. 
+ This controls crash-recovery granularity, not peak memory — to_zarr + flushes each period immediately. + """ + + max_concurrency: int + commit_batch_size: int + + async def probe(self, bbox: list[float], **params: Any) -> GridSpec: + """Metadata-only source probe. Returns grid spec. No data transfer.""" + ... + + async def periods(self, start: str, end: str) -> list[str]: + """Return the ordered list of available period IDs from start to end. + + May query the upstream source to confirm which periods are published. + The orchestrator uses the length of this list for progress reporting. + """ + ... + + async def fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> "xr.Dataset": + """Fetch one period. Return a dataset in the source CRS. + + The returned dataset must have a 'time' dimension with a single + coordinate value. Spatial dimensions must match spec.x_dim / spec.y_dim. + The orchestrator handles zarr writes — never call to_zarr here. + """ + ... diff --git a/climate_api/ingest/store.py b/climate_api/ingest/store.py new file mode 100644 index 00000000..3210e2e1 --- /dev/null +++ b/climate_api/ingest/store.py @@ -0,0 +1,50 @@ +"""Icechunk store lifecycle helpers.""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import icechunk + +logger = logging.getLogger(__name__) + + +def open_or_create_repo(store_path: Path) -> "icechunk.Repository": + """Open an existing Icechunk repository or create one at store_path.""" + import icechunk + + storage = icechunk.local_filesystem_storage(str(store_path)) + if store_path.exists(): + return icechunk.Repository.open(storage) + return icechunk.Repository.create(storage) + + +def read_committed_period_ids(store_path: Path, period_type: str) -> set[str]: + """Return the set of period IDs already committed to the Icechunk store. 
+ + Reads the time coordinate from the last committed snapshot and converts + each timestamp back to a period string using the dataset's period_type. + Returns an empty set when the store does not yet exist or has no time dim. + """ + import xarray as xr + + from climate_api.shared.time import datetime_to_period_string + + if not store_path.exists(): + return set() + + try: + repo = open_or_create_repo(store_path) + session = repo.readonly_session("main") + ds = xr.open_zarr(session.store) + if "time" not in ds.coords: + return set() + import pandas as pd + + return {datetime_to_period_string(pd.Timestamp(t.item()).to_pydatetime(), period_type) for t in ds.time} + except Exception: + logger.debug("Could not read committed periods from %s", store_path, exc_info=True) + return set() diff --git a/climate_api/ingestions/schemas.py b/climate_api/ingestions/schemas.py index eb3d7292..4f49253e 100644 --- a/climate_api/ingestions/schemas.py +++ b/climate_api/ingestions/schemas.py @@ -11,6 +11,7 @@ class ArtifactFormat(StrEnum): ZARR = "zarr" NETCDF = "netcdf" + ICECHUNK = "icechunk" class PublicationStatus(StrEnum): diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 04fbe709..98eb829b 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -18,7 +18,7 @@ from starlette.responses import Response from climate_api import config as api_config -from climate_api.data_accessor.services.accessor import get_data_coverage_for_paths +from climate_api.data_accessor.services.accessor import coverage_from_open_dataset, get_data_coverage_for_paths from climate_api.data_manager.services import downloader from climate_api.data_registry.services import datasets as registry_datasets from climate_api.extents.services import get_extent @@ -198,6 +198,17 @@ def create_artifact( return publish_artifact_record(existing.artifact_id) return existing + ingestion = dataset.get("ingestion") or {} + if isinstance(ingestion, 
dict) and ingestion.get("plugin"): + return _create_icechunk_artifact( + dataset=dataset, + start=start, + end=resolved_download_end, + bbox=bbox, + request_scope=request_scope, + publish=publish, + ) + logger.info( "Downloading dataset '%s': request_scope=%s..%s download_scope=%s..%s prefer_zarr=%s publish=%s", dataset["id"], @@ -318,6 +329,92 @@ def create_artifact( return stored_record +def _create_icechunk_artifact( + *, + dataset: dict[str, object], + start: str, + end: str, + bbox: list[float] | None, + request_scope: ArtifactRequestScope, + publish: bool, +) -> ArtifactRecord: + """Run per-period Icechunk ingest and register the resulting store as an artifact.""" + from climate_api.ingest.orchestrator import load_plugin, run_ingest_sync + from climate_api.ingest.store import open_or_create_repo + + dataset_id = str(dataset["id"]) + period_type = str(dataset["period_type"]) + ingestion = dict(dataset.get("ingestion") or {}) # type: ignore[arg-type] + plugin_path = str(ingestion["plugin"]) + params = dict(ingestion.get("params") or {}) + + extent = get_extent() + resolved_bbox: list[float] = list(bbox) if bbox is not None else ( + list(extent["bbox"]) if extent else [-180, -90, 180, 90] + ) + store_path = downloader.DOWNLOAD_DIR / f"{dataset_id}.icechunk" + + plugin = load_plugin(plugin_path, params) + + logger.info("Running Icechunk ingest for '%s': %s..%s", dataset_id, start, end) + run_ingest_sync( + plugin=plugin, + params=params, + bbox=resolved_bbox, + start=start, + end=end, + store_path=store_path, + period_type=period_type, + ) + + repo = open_or_create_repo(store_path) + session = repo.readonly_session("main") + import xarray as xr + + ds = xr.open_zarr(session.store) + from climate_api import config as api_config + + native_crs = api_config.get_crs() or "EPSG:4326" + coverage_data = coverage_from_open_dataset(ds, period_type=period_type, native_crs=native_crs) + ds.close() + + if not coverage_data.get("has_data", True): + raise 
HTTPException(status_code=409, detail="Icechunk store contains no data for the requested scope") + + _spatial_wgs84_data = coverage_data["coverage"].get("spatial_wgs84") + coverage = ArtifactCoverage( + temporal=CoverageTemporal(**coverage_data["coverage"]["temporal"]), + spatial=CoverageSpatial(**coverage_data["coverage"]["spatial"]), + spatial_wgs84=CoverageSpatial(**_spatial_wgs84_data) if _spatial_wgs84_data else None, + ) + + record = ArtifactRecord( + artifact_id=str(uuid4()), + dataset_id=dataset_id, + dataset_name=str(dataset["name"]), + variable=str(dataset["variable"]), + format=ArtifactFormat.ICECHUNK, + path=str(store_path.resolve()), + asset_paths=[str(store_path.resolve())], + variables=[str(dataset["variable"])], + request_scope=request_scope, + coverage=coverage, + created_at=datetime.now(UTC), + publication=ArtifactPublication(), + ) + stored = _store_artifact_record(record, prefer_zarr=False, publish=publish) + logger.info( + "Stored Icechunk artifact '%s' for '%s': coverage=%s..%s", + stored.artifact_id, + dataset_id, + stored.coverage.temporal.start, + stored.coverage.temporal.end, + ) + if publish and stored.publication.status != PublicationStatus.PUBLISHED: + return publish_artifact_record(stored.artifact_id) + return stored + + def publish_artifact_record(artifact_id: str) -> ArtifactRecord: """Publish an artifact via pygeoapi and persist publication metadata.""" published = publish_artifact(get_artifact_or_404(artifact_id)) diff --git a/pyproject.toml b/pyproject.toml index 8c96f96f..5ad36fe3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ dependencies = [ "rioxarray>=0.17", "portalocker>=3.2.0", "dhis2eo>=1.2.1", + "icechunk>=2.0,<3", ] [project.urls] diff --git a/tests/test_ingest_orchestrator.py b/tests/test_ingest_orchestrator.py new file mode 100644 index 00000000..0a3717d6 --- /dev/null +++ b/tests/test_ingest_orchestrator.py @@ -0,0 +1,326 @@ +"""Tests for the per-period Icechunk ingest orchestrator. 
+ +All tests use FakePlugin — no network access required. +The Icechunk store is written to a pytest tmp_path directory. +""" + +from __future__ import annotations + +import asyncio +from pathlib import Path +from typing import Any + +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from climate_api.ingest.orchestrator import load_plugin, run_ingest, run_ingest_sync +from climate_api.ingest.protocol import GridSpec, IngestionPlugin +from climate_api.ingest.store import read_committed_period_ids + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_monthly_dataset(period_id: str, ny: int = 4, nx: int = 4) -> xr.Dataset: + """Return a tiny single-period dataset matching FakePlugin's grid.""" + t = pd.Timestamp(f"{period_id}-01") + data = np.random.default_rng(42).random((1, ny, nx)).astype("float32") + return xr.Dataset( + {"temperature": xr.DataArray(data, dims=["time", "y", "x"])}, + coords={"time": [t]}, + ) + + +# --------------------------------------------------------------------------- +# Fake plugin +# --------------------------------------------------------------------------- + +class FakePlugin: + """In-memory IngestionPlugin that generates tiny xarray Datasets.""" + + max_concurrency = 2 + commit_batch_size = 2 + + def __init__(self, periods: list[str]) -> None: + self._periods = periods + self.fetched: list[str] = [] + + async def probe(self, bbox: list[float], **params: Any) -> GridSpec: + return GridSpec(shape=(4, 4), crs=4326, dtype=np.dtype("float32"), nodata=None) + + async def periods(self, start: str, end: str) -> list[str]: + return [p for p in self._periods if start <= p <= end] + + async def fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> xr.Dataset: + self.fetched.append(period_id) + return _make_monthly_dataset(period_id) + + +# 
--------------------------------------------------------------------------- +# Protocol conformance +# --------------------------------------------------------------------------- + +def test_fake_plugin_satisfies_protocol() -> None: + plugin = FakePlugin(["2024-01", "2024-02"]) + assert isinstance(plugin, IngestionPlugin) + + +# --------------------------------------------------------------------------- +# Core orchestrator tests +# --------------------------------------------------------------------------- + +def test_run_ingest_writes_all_periods(tmp_path: Path) -> None: + plugin = FakePlugin(["2024-01", "2024-02", "2024-03"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + ) + ) + + assert store_path.exists() + committed = read_committed_period_ids(store_path, "monthly") + assert committed == {"2024-01", "2024-02", "2024-03"} + + +def test_run_ingest_fetches_every_period_exactly_once(tmp_path: Path) -> None: + plugin = FakePlugin(["2024-01", "2024-02", "2024-03"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + ) + ) + + assert sorted(plugin.fetched) == ["2024-01", "2024-02", "2024-03"] + + +def test_run_ingest_is_idempotent(tmp_path: Path) -> None: + plugin = FakePlugin(["2024-01", "2024-02"]) + store_path = tmp_path / "test.icechunk" + + for _ in range(2): + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-02", + store_path=store_path, + period_type="monthly", + ) + ) + + # Second run fetched nothing new. 
+ assert plugin.fetched == ["2024-01", "2024-02"] + committed = read_committed_period_ids(store_path, "monthly") + assert committed == {"2024-01", "2024-02"} + + +def test_run_ingest_resumes_from_cursor(tmp_path: Path) -> None: + """Simulate a crash after the first batch (2024-01, 2024-02) and resume.""" + plugin = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04"]) + store_path = tmp_path / "test.icechunk" + + # First run ingests only the first batch (end="2024-02"), simulating a crash before 2024-03 and 2024-04 are written. + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-02", + store_path=store_path, + period_type="monthly", + ) + ) + assert read_committed_period_ids(store_path, "monthly") == {"2024-01", "2024-02"} + + # Resume: provide a cursor pointing at the last committed period. + cursor: dict[str, Any] = {"last_committed": "2024-02"} + plugin2 = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04"]) + + asyncio.run( + run_ingest( + plugin=plugin2, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-04", + store_path=store_path, + period_type="monthly", + load_cursor=lambda: cursor, + ) + ) + + # Only the two new periods were fetched.
+ assert sorted(plugin2.fetched) == ["2024-03", "2024-04"] + committed = read_committed_period_ids(store_path, "monthly") + assert committed == {"2024-01", "2024-02", "2024-03", "2024-04"} + + +def test_run_ingest_progress_reporting(tmp_path: Path) -> None: + plugin = FakePlugin(["2024-01", "2024-02", "2024-03"]) + store_path = tmp_path / "test.icechunk" + reports: list[dict[str, Any]] = [] + + def on_progress(done: int, total: int, message: str) -> None: + reports.append({"done": done, "total": total}) + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + on_progress=on_progress, + ) + ) + + totals = {r["total"] for r in reports} + assert totals == {3} + final = max(r["done"] for r in reports) + assert final == 3 + + +def test_run_ingest_cursor_saved_after_each_batch(tmp_path: Path) -> None: + """commit_batch_size=2 → cursor is saved after periods 2 and 4.""" + plugin = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04"]) + store_path = tmp_path / "test.icechunk" + saved_cursors: list[dict[str, Any]] = [] + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-04", + store_path=store_path, + period_type="monthly", + save_cursor=saved_cursors.append, + ) + ) + + assert len(saved_cursors) == 2 + assert saved_cursors[0]["last_committed"] == "2024-02" + assert saved_cursors[1]["last_committed"] == "2024-04" + + +def test_run_ingest_noop_when_no_periods(tmp_path: Path) -> None: + plugin = FakePlugin([]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + ) + ) + + assert not store_path.exists() + + +def test_run_ingest_cancels_on_request(tmp_path: Path) -> None: + from climate_api.jobs.models import 
JobCancelledError + + # Plugin with more periods than the batch — cancellation hits mid-run. + plugin = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04", "2024-05", "2024-06"]) + store_path = tmp_path / "test.icechunk" + + call_count = 0 + + def cancel_after_two() -> bool: + nonlocal call_count + call_count += 1 + # First check (before period 0) → False; subsequent checks → True after first batch. + return call_count > 1 + + with pytest.raises(JobCancelledError): + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-06", + store_path=store_path, + period_type="monthly", + is_cancel_requested=cancel_after_two, + ) + ) + + +# --------------------------------------------------------------------------- +# Sync wrapper +# --------------------------------------------------------------------------- + +def test_run_ingest_sync_wrapper(tmp_path: Path) -> None: + plugin = FakePlugin(["2024-01", "2024-02"]) + store_path = tmp_path / "test.icechunk" + + run_ingest_sync( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-02", + store_path=store_path, + period_type="monthly", + ) + + assert read_committed_period_ids(store_path, "monthly") == {"2024-01", "2024-02"} + + +# --------------------------------------------------------------------------- +# load_plugin +# --------------------------------------------------------------------------- + +def test_load_plugin_imports_and_instantiates(tmp_path: Path) -> None: + """load_plugin can resolve built-in plugins by dotted path.""" + plugin = load_plugin("climate_api.ingest.plugins.era5_land.Era5LandPlugin", {"variable": "t2m"}) + assert isinstance(plugin, IngestionPlugin) + assert plugin.max_concurrency >= 1 # type: ignore[attr-defined] + + +def test_load_plugin_raises_for_invalid_path() -> None: + with pytest.raises(ValueError, match="Invalid plugin path"): + load_plugin("NotADottedPath", {}) + + +def 
test_load_plugin_raises_for_non_protocol() -> None: + with pytest.raises(TypeError, match="does not implement IngestionPlugin"): + load_plugin("builtins.dict", {}) + + +# --------------------------------------------------------------------------- +# read_committed_period_ids +# --------------------------------------------------------------------------- + +def test_read_committed_period_ids_empty_when_no_store(tmp_path: Path) -> None: + assert read_committed_period_ids(tmp_path / "nostore.icechunk", "monthly") == set() From 28337be7577a9326f478d6d778d37c79170e61fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 16:12:18 +0200 Subject: [PATCH 02/80] fix: per-period commit and hourly period filter in ERA5-Land plugin Two bugs found during Norway instance testing: 1. Icechunk 2.x sessions do not expose uncommitted writes to subsequent zarr.open_group calls, so to_zarr(append_dim='time') on period N+1 saw an empty store. Fix: open a fresh writable session per period so each append reads the prior committed snapshot. commit_batch_size now controls cursor-save frequency rather than commit frequency. 2. Era5LandPlugin._build_periods ignored the hour component of start/end, so 'start=2024-01-01T00 end=2024-01-01T00' generated all 24 hours. Fix: compare full period-ID strings (lexicographic) after generating. Add three unit tests for the period-filter fix. --- climate_api/ingest/orchestrator.py | 36 ++++++++++++++----------- climate_api/ingest/plugins/era5_land.py | 22 ++++++++++----- tests/test_ingest_orchestrator.py | 28 +++++++++++++++++++ 3 files changed, 64 insertions(+), 22 deletions(-) diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index 0c8f793a..c6c97eff 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -4,9 +4,10 @@ Plugins implement three focused async methods (probe / periods / fetch_period) and never touch zarr directly. 
-Crash recovery: each commit advances the job cursor. On restart the -orchestrator reads the cursor, skips already-committed periods, and continues -from where it stopped. A crash loses at most one uncommitted batch. +Crash recovery: every period is committed individually. The cursor is saved +every commit_batch_size periods so that a restart resumes from the last +cursor checkpoint. A crash loses at most commit_batch_size periods of +re-fetch work (the store itself is always in a valid committed state). """ from __future__ import annotations @@ -111,8 +112,6 @@ async def _fetch(period_id: str) -> xr.Dataset: # Await in chronological order so writes are always sequential. tasks = [asyncio.create_task(_fetch(p)) for p in pending] - session = repo.writable_session("main") - for i, task in enumerate(tasks): if is_cancel_requested and is_cancel_requested(): for t in tasks[i:]: @@ -124,23 +123,30 @@ async def _fetch(period_id: str) -> xr.Dataset: ds = await task period_id = pending[i] + # Each period uses its own writable session so that to_zarr(append_dim=) + # on the next period reads the committed store and finds the time axis. + # Icechunk 2.x sessions do not expose uncommitted writes to subsequent + # zarr.open_group calls, so batching writes within one session breaks the + # append — committing per period is the correct pattern. + session = repo.writable_session("main") + if not spec.time_dim: - # Static dataset: single write, no append dimension. ds.to_zarr(session.store, mode="w") elif i == 0 and is_first_write: ds.to_zarr(session.store, mode="w") else: ds.to_zarr(session.store, append_dim="time") - should_commit = (i + 1) % plugin.commit_batch_size == 0 or (i + 1) == len(pending) - if should_commit: - session.commit(f"ingest up to {period_id}") - logger.info("Committed: up to %s (%d/%d)", period_id, i + 1, len(pending)) - if save_cursor: - save_cursor({"last_committed": period_id}) - if (i + 1) < len(pending): - # Fresh writable session for the next batch. 
- session = repo.writable_session("main") + session.commit(f"ingest: {period_id}") + + # Save cursor at commit_batch_size intervals and at the end. + # commit_batch_size controls resume granularity (cursor save frequency), + # not commit frequency — every period is committed for correctness. + if save_cursor and ((i + 1) % plugin.commit_batch_size == 0 or (i + 1) == len(pending)): + save_cursor({"last_committed": period_id}) + logger.info("Cursor saved: up to %s (%d/%d)", period_id, i + 1, len(pending)) + + logger.debug("Committed: %s (%d/%d)", period_id, i + 1, len(pending)) if on_progress: on_progress(done=done_offset + i + 1, total=len(all_periods), message=f"Wrote {period_id}") diff --git a/climate_api/ingest/plugins/era5_land.py b/climate_api/ingest/plugins/era5_land.py index 97af25a3..f1a103e3 100644 --- a/climate_api/ingest/plugins/era5_land.py +++ b/climate_api/ingest/plugins/era5_land.py @@ -132,22 +132,30 @@ def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: # ------------------------------------------------------------------ def _build_periods(self, start: str, end: str) -> list[str]: - """Generate hourly period IDs, clamped to the provider's availability lag.""" + """Generate hourly period IDs, clamped to the provider's availability lag. + + start and end are period-ID strings of the form 'YYYY-MM-DDTHH'. + The comparison is lexicographic so the filter respects the hour component. + """ cutoff = date.today() - timedelta(days=_LAG_DAYS) start_dt = date.fromisoformat(start[:10]) end_dt = min(date.fromisoformat(end[:10]), cutoff) + # Cutoff clamped to end-of-day of the cutoff date so we filter later. 
+ cutoff_period = f"{cutoff.isoformat()}T23" periods: list[str] = [] current = start_dt while current <= end_dt: _, last_day = calendar.monthrange(current.year, current.month) - for day in range(current.day if current == start_dt else 1, last_day + 1): - d = current.replace(day=day) - if d > end_dt: - break + for day_num in range(1, last_day + 1): + d = current.replace(day=day_num) + if d < start_dt or d > end_dt: + continue for hour in range(24): - periods.append(f"{d.isoformat()}T{hour:02d}") - # Advance to first day of next month + p = f"{d.isoformat()}T{hour:02d}" + if p < start or p > end or p > cutoff_period: + continue + periods.append(p) if current.month == 12: current = current.replace(year=current.year + 1, month=1, day=1) else: diff --git a/tests/test_ingest_orchestrator.py b/tests/test_ingest_orchestrator.py index 0a3717d6..40f25720 100644 --- a/tests/test_ingest_orchestrator.py +++ b/tests/test_ingest_orchestrator.py @@ -324,3 +324,31 @@ def test_load_plugin_raises_for_non_protocol() -> None: def test_read_committed_period_ids_empty_when_no_store(tmp_path: Path) -> None: assert read_committed_period_ids(tmp_path / "nostore.icechunk", "monthly") == set() + + +# --------------------------------------------------------------------------- +# Era5LandPlugin._build_periods (unit tests, no network) +# --------------------------------------------------------------------------- + +def test_era5land_build_periods_respects_hour_component() -> None: + from climate_api.ingest.plugins.era5_land import Era5LandPlugin + + plugin = Era5LandPlugin(variable="t2m") + periods = plugin._build_periods("2024-01-01T06", "2024-01-01T08") + assert periods == ["2024-01-01T06", "2024-01-01T07", "2024-01-01T08"] + + +def test_era5land_build_periods_single_hour() -> None: + from climate_api.ingest.plugins.era5_land import Era5LandPlugin + + plugin = Era5LandPlugin(variable="t2m") + periods = plugin._build_periods("2024-01-01T00", "2024-01-01T00") + assert periods == 
["2024-01-01T00"] + + +def test_era5land_build_periods_spans_months() -> None: + from climate_api.ingest.plugins.era5_land import Era5LandPlugin + + plugin = Era5LandPlugin(variable="t2m") + periods = plugin._build_periods("2024-01-31T23", "2024-02-01T01") + assert periods == ["2024-01-31T23", "2024-02-01T00", "2024-02-01T01"] From 1285d4696af46f399f263cde7f22f176fd475a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 16:19:14 +0200 Subject: [PATCH 03/80] fix: strip zarr v2 Blosc encoding before writing to icechunk store ERA5-Land source data carries zarr v2 Blosc codec metadata. When xarray loads the dataset it retains the source encoding, which fails with a BytesBytesCodec type error when writing to a zarr v3/icechunk store. Clearing encoding on all vars and coords before returning from fetch lets the orchestrator write with zarr v3-compatible defaults. --- climate_api/ingest/plugins/era5_land.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/climate_api/ingest/plugins/era5_land.py b/climate_api/ingest/plugins/era5_land.py index f1a103e3..504a0f56 100644 --- a/climate_api/ingest/plugins/era5_land.py +++ b/climate_api/ingest/plugins/era5_land.py @@ -125,7 +125,12 @@ def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: ds = ds.expand_dims("time").assign_coords(time=[ds.valid_time.values]) ds = ds.rename({"longitude": "x", "latitude": "y"}) - return ds.load() + ds = ds.load() + # Strip zarr v2 codec encoding (Blosc) so the orchestrator writes + # with zarr v3-compatible defaults into the icechunk store. 
+ for name in list(ds.data_vars) + list(ds.coords): + ds[name].encoding.clear() + return ds # ------------------------------------------------------------------ # Period generation From 8855cfae5da0ae9273a2e4991ce1b7317c9d365f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 16:26:48 +0200 Subject: [PATCH 04/80] fix: pin time encoding to hours epoch so hourly appends land on correct timestamps When encoding is cleared and xarray picks a default unit for the first write it may choose "days since YYYY-MM-DD". Subsequent append_dim writes then encode sub-daily offsets as integer days, shifting all non-midnight hours to the wrong date. Explicitly setting "hours since 1970-01-01" on every fetched period ensures the first write establishes a unit that all appends can faithfully represent as integers. --- climate_api/ingest/plugins/era5_land.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/climate_api/ingest/plugins/era5_land.py b/climate_api/ingest/plugins/era5_land.py index 504a0f56..441b627a 100644 --- a/climate_api/ingest/plugins/era5_land.py +++ b/climate_api/ingest/plugins/era5_land.py @@ -130,6 +130,11 @@ def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: # with zarr v3-compatible defaults into the icechunk store. for name in list(ds.data_vars) + list(ds.coords): ds[name].encoding.clear() + # Pin time to a stable hourly unit so every period append uses the + # same encoding. Without this, the first write picks "days since …" + # and sub-daily values on subsequent appends land on the wrong hour. 
+ if "time" in ds.coords: + ds["time"].encoding.update({"units": "hours since 1970-01-01", "dtype": "int64"}) return ds # ------------------------------------------------------------------ From 67d1ccf64839361433749a39e6f828d857335f98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 16:47:11 +0200 Subject: [PATCH 05/80] =?UTF-8?q?feat:=20store-based=20sync=20for=20Icechu?= =?UTF-8?q?nk=20=E2=80=94=20use=20read=5Fcommitted=5Fperiod=5Fids=20as=20s?= =?UTF-8?q?ource=20of=20truth?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related changes: 1. plan_sync accepts an optional current_end parameter that overrides the artifact metadata's coverage.temporal.end. For Icechunk stores the artifact record may lag behind what is actually committed on disk, so callers pass the store-authoritative value. 2. _supports_append recognises ArtifactFormat.ICECHUNK and always returns True. The orchestrator already handles incremental appends via its own read_committed_period_ids call; no sync.execution: append YAML flag is required for Icechunk datasets. 3. sync_dataset reads read_committed_period_ids directly from the Icechunk store before calling run_sync and passes the result as current_end. This ensures the sync plan reflects the true on-disk state rather than stale metadata. Additionally, _create_icechunk_artifact now accepts an optional ingest_start parameter. For append syncs this is set to the delta_start so the orchestrator scans only the new period range rather than the full historical range (e.g. from 1950), which avoids enumerating hundreds of thousands of already-committed periods needlessly. 
--- climate_api/ingestions/services.py | 38 +++- climate_api/ingestions/sync_engine.py | 27 ++- tests/test_datasets_sync.py | 305 ++++++++++++++++++++++++++ 3 files changed, 364 insertions(+), 6 deletions(-) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 98eb829b..59fe358d 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -207,6 +207,7 @@ def create_artifact( bbox=bbox, request_scope=request_scope, publish=publish, + ingest_start=download_start, ) logger.info( @@ -337,8 +338,15 @@ def _create_icechunk_artifact( bbox: list[float] | None, request_scope: ArtifactRequestScope, publish: bool, + ingest_start: str | None = None, ) -> ArtifactRecord: - """Run per-period Icechunk ingest and register the resulting store as an artifact.""" + """Run per-period Icechunk ingest and register the resulting store as an artifact. + + `ingest_start` is the period from which the orchestrator begins its period scan. + For delta/append syncs this is the first missing period (delta_start), which avoids + enumerating the entire historical range just to discover that all prior periods are + already committed. When omitted the full artifact `start` is used. 
+ """ from climate_api.ingest.orchestrator import load_plugin, run_ingest_sync from climate_api.ingest.store import open_or_create_repo @@ -356,12 +364,16 @@ def _create_icechunk_artifact( plugin = load_plugin(plugin_path, params) - logger.info("Running Icechunk ingest for '%s': %s..%s", dataset_id, start, end) + effective_start = ingest_start if ingest_start is not None else start + logger.info( + "Running Icechunk ingest for '%s': ingest_scope=%s..%s artifact_scope=%s..%s", + dataset_id, effective_start, end, start, end, + ) run_ingest_sync( plugin=plugin, params=params, bbox=resolved_bbox, - start=start, + start=effective_start, end=end, store_path=store_path, period_type=period_type, @@ -496,6 +508,10 @@ def sync_dataset( The service layer stays thin on purpose: it validates that the requested public dataset id resolves to a managed dataset plus a source template, then hands execution to `sync_engine.run_sync(...)`. + + For Icechunk artifacts the authoritative `current_end` is read directly from + the store's committed period log rather than from the potentially-stale artifact + metadata record, so the sync plan reflects the true on-disk state. 
""" latest_artifact = get_latest_artifact_for_dataset_or_404(dataset_id) source_dataset = registry_datasets.get_dataset(latest_artifact.dataset_id) @@ -503,6 +519,21 @@ def sync_dataset( raise HTTPException(status_code=404, detail=f"Source dataset '{latest_artifact.dataset_id}' not found") extent = get_extent() resolved_country_code = extent.get("country_code") if extent else None + + committed_end: str | None = None + if latest_artifact.format == ArtifactFormat.ICECHUNK and latest_artifact.path: + from climate_api.ingest.store import read_committed_period_ids + + period_type = str(source_dataset.get("period_type", "")) + committed = read_committed_period_ids(Path(latest_artifact.path), period_type) + committed_end = max(committed) if committed else None + logger.info( + "Icechunk store-based current_end for '%s': %s (artifact record had: %s)", + dataset_id, + committed_end, + latest_artifact.coverage.temporal.end, + ) + try: return run_sync( latest_artifact=latest_artifact, @@ -513,6 +544,7 @@ def sync_dataset( publish=publish, create_artifact_fn=create_artifact, get_dataset_fn=get_dataset_or_404, + current_end=committed_end, ) except SyncConfigurationError as exc: raise HTTPException(status_code=500, detail=str(exc)) from exc diff --git a/climate_api/ingestions/sync_engine.py b/climate_api/ingestions/sync_engine.py index 1685065f..221adfe1 100644 --- a/climate_api/ingestions/sync_engine.py +++ b/climate_api/ingestions/sync_engine.py @@ -42,6 +42,7 @@ def plan_sync( source_dataset: dict[str, Any], latest_artifact: ArtifactRecord, requested_end: str | None, + current_end: str | None = None, ) -> SyncDetail: """Return the sync decision for one managed dataset without changing local state. @@ -54,6 +55,10 @@ def plan_sync( - release datasets compare the current materialized release against the requested end - static datasets are marked as not syncable + `current_end` overrides `latest_artifact.coverage.temporal.end` when provided. 
+ Callers pass the store-authoritative value for formats (e.g. Icechunk) where the + artifact metadata record may lag behind what is actually committed on disk. + This planner deliberately does not download data or persist artifacts. """ sync_kind_value = source_dataset.get("sync", {}).get("kind") @@ -61,7 +66,7 @@ def plan_sync( raise ValueError("source_dataset must define sync.kind for sync planning") sync_kind = SyncKind(sync_kind_value) current_start = latest_artifact.request_scope.start - current_end = latest_artifact.coverage.temporal.end + current_end = current_end if current_end is not None else latest_artifact.coverage.temporal.end if sync_kind == SyncKind.STATIC: return SyncDetail( @@ -168,6 +173,7 @@ def run_sync( publish: bool, create_artifact_fn: Callable[..., ArtifactRecord], get_dataset_fn: Callable[[str], Any], + current_end: str | None = None, ) -> SyncResponse: """Plan and execute one sync operation for a managed dataset. @@ -183,6 +189,7 @@ def run_sync( source_dataset=source_dataset, latest_artifact=latest_artifact, requested_end=requested_end, + current_end=current_end, ) dataset_id = managed_dataset_id_for(latest_artifact) logger.info( @@ -360,13 +367,27 @@ def _latest_available_end(*, source_dataset: dict[str, Any], requested_end: str) def _supports_append(source_dataset: dict[str, Any], latest_artifact: ArtifactRecord) -> bool: - """Return whether this template opts into V1 delta-download sync execution.""" - from pathlib import Path + """Return whether this artifact supports incremental append sync execution. + + Icechunk stores always support append: the orchestrator uses read_committed_period_ids + to determine exactly which periods are missing and commits only those. No YAML + sync.execution flag is required. + + For all other formats the YAML must opt in with sync.execution: append, and + pyramid zarr stores (identified by a "0/" subdirectory) are excluded because + they must be rebuilt in full. 
+ """ + from climate_api.ingestions.schemas import ArtifactFormat + + if latest_artifact.format == ArtifactFormat.ICECHUNK: + return True if source_dataset.get("sync", {}).get("execution") != SyncAction.APPEND.value: return False # Pyramid zarr stores cannot be appended to — they must be rebuilt in full. # Detect this from the existing artifact's on-disk structure rather than YAML. + from pathlib import Path + artifact_path = latest_artifact.path if artifact_path and "://" not in artifact_path and (Path(artifact_path) / "0").is_dir(): logger.warning( diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index cbd7cd13..6694f9cb 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -860,3 +860,308 @@ def fake_run_sync(**kwargs: object) -> SyncResponse: services.sync_dataset(dataset_id=dataset_id, end="2021", prefer_zarr=True, publish=True) assert captured["country_code"] == "SLE" + + +# --------------------------------------------------------------------------- +# Icechunk store-based sync +# --------------------------------------------------------------------------- + + +def _icechunk_artifact( + *, + artifact_id: str, + source_dataset_id: str = "era5land_temperature_hourly", + managed_dataset_id: str = "era5land_temperature_hourly_nor", + end: str = "2024-01-01T03", + path: str = "/tmp/era5land_temperature_hourly.icechunk", +) -> ArtifactRecord: + return ArtifactRecord( + artifact_id=artifact_id, + dataset_id=source_dataset_id, + dataset_name="2m temperature (ERA5-Land)", + variable="t2m", + format=ArtifactFormat.ICECHUNK, + path=path, + asset_paths=[path], + variables=["t2m"], + request_scope=ArtifactRequestScope( + start="2024-01-01T00", + end=end, + bbox=(4.0, 57.5, 31.5, 71.5), + ), + coverage=ArtifactCoverage( + temporal=CoverageTemporal(start="2024-01-01T00", end=end), + spatial=CoverageSpatial(xmin=4.0, ymin=57.5, xmax=31.5, ymax=71.5), + ), + created_at=datetime.fromisoformat("2024-01-01T04:00:00+00:00"), + 
publication=ArtifactPublication(status=PublicationStatus.PUBLISHED), + ) + + +def test_plan_sync_uses_current_end_override_instead_of_artifact_metadata() -> None: + """current_end parameter takes precedence over latest_artifact.coverage.temporal.end.""" + artifact = _icechunk_artifact(artifact_id="a1", end="2024-01-01T03") + + result = sync_engine.plan_sync( + source_dataset={ + "id": "era5land_temperature_hourly", + "period_type": "hourly", + "sync": {"kind": "temporal"}, + }, + latest_artifact=artifact, + requested_end="2024-01-01T06", + current_end="2024-01-01T05", + ) + + assert result.current_end == "2024-01-01T05" + assert result.delta_start == "2024-01-01T06" + assert result.delta_end == "2024-01-01T06" + assert result.target_end == "2024-01-01T06" + + +def test_plan_sync_falls_back_to_artifact_end_when_no_override() -> None: + artifact = _icechunk_artifact(artifact_id="a1", end="2024-01-01T03") + + result = sync_engine.plan_sync( + source_dataset={ + "id": "era5land_temperature_hourly", + "period_type": "hourly", + "sync": {"kind": "temporal"}, + }, + latest_artifact=artifact, + requested_end="2024-01-01T06", + ) + + assert result.current_end == "2024-01-01T03" + assert result.delta_start == "2024-01-01T04" + + +def test_supports_append_returns_true_for_icechunk_format_without_yaml_execution_flag() -> None: + """ICECHUNK format always supports append — no sync.execution: append needed in YAML.""" + artifact = _icechunk_artifact(artifact_id="a1") + + result = sync_engine._supports_append( + source_dataset={"id": "era5land_temperature_hourly", "period_type": "hourly", "sync": {"kind": "temporal"}}, + latest_artifact=artifact, + ) + + assert result is True + + +def test_supports_append_requires_yaml_execution_flag_for_zarr_format() -> None: + zarr_artifact = _artifact(artifact_id="a1", end="2026-01-10") + + without_flag = sync_engine._supports_append( + source_dataset={"id": "chirps3_precipitation_daily", "period_type": "daily", "sync": {"kind": "temporal"}}, 
+ latest_artifact=zarr_artifact, + ) + with_flag = sync_engine._supports_append( + source_dataset={ + "id": "chirps3_precipitation_daily", + "period_type": "daily", + "sync": {"kind": "temporal", "execution": "append"}, + }, + latest_artifact=zarr_artifact, + ) + + assert without_flag is False + assert with_flag is True + + +def test_sync_dataset_reads_committed_end_from_icechunk_store(monkeypatch: pytest.MonkeyPatch) -> None: + """sync_dataset passes the store-authoritative current_end to run_sync for ICECHUNK artifacts.""" + dataset_id = "era5land_temperature_hourly_nor" + latest = _icechunk_artifact(artifact_id="a1", end="2024-01-01T03") + monkeypatch.setattr(services, "get_latest_artifact_for_dataset_or_404", lambda _: latest) + monkeypatch.setattr( + services.registry_datasets, + "get_dataset", + lambda _: { + "id": "era5land_temperature_hourly", + "period_type": "hourly", + "sync": {"kind": "temporal"}, + }, + ) + + # Store has T00-T05; artifact record only knows about T03. + import climate_api.ingest.store as ingest_store + + monkeypatch.setattr( + ingest_store, + "read_committed_period_ids", + lambda path, period_type: {"2024-01-01T00", "2024-01-01T01", "2024-01-01T02", "2024-01-01T03", + "2024-01-01T04", "2024-01-01T05"}, + ) + + captured: dict[str, object] = {} + + def fake_run_sync(**kwargs: object) -> SyncResponse: + captured.update(kwargs) + return SyncResponse( + sync_id=None, + status="up_to_date", + message="ok", + dataset=_dataset_detail(dataset_id), + sync_detail=SyncDetail( + source_dataset_id="era5land_temperature_hourly", + sync_kind=SyncKind.TEMPORAL, + action=SyncAction.NO_OP, + reason="no_new_period", + message="ok", + current_start="2024-01-01T00", + current_end="2024-01-01T05", + target_end="2024-01-01T05", + target_end_source="request", + ), + ) + + monkeypatch.setattr(services, "run_sync", fake_run_sync) + + services.sync_dataset(dataset_id=dataset_id, end="2024-01-01T05", prefer_zarr=False, publish=False) + + assert 
captured["current_end"] == "2024-01-01T05" + + +def test_sync_dataset_icechunk_store_empty_uses_none_current_end(monkeypatch: pytest.MonkeyPatch) -> None: + """When the store has no committed periods yet, current_end is None (full ingest).""" + dataset_id = "era5land_temperature_hourly_nor" + latest = _icechunk_artifact(artifact_id="a1", end="2024-01-01T03") + monkeypatch.setattr(services, "get_latest_artifact_for_dataset_or_404", lambda _: latest) + monkeypatch.setattr( + services.registry_datasets, + "get_dataset", + lambda _: {"id": "era5land_temperature_hourly", "period_type": "hourly", "sync": {"kind": "temporal"}}, + ) + import climate_api.ingest.store as ingest_store + + monkeypatch.setattr(ingest_store, "read_committed_period_ids", lambda *_: set()) + + captured: dict[str, object] = {} + + def fake_run_sync(**kwargs: object) -> SyncResponse: + captured.update(kwargs) + return SyncResponse( + sync_id="a2", + status="completed", + message="ok", + dataset=_dataset_detail(dataset_id), + sync_detail=SyncDetail( + source_dataset_id="era5land_temperature_hourly", + sync_kind=SyncKind.TEMPORAL, + action=SyncAction.REMATERIALIZE, + reason="new_periods_available", + message="ok", + current_start="2024-01-01T00", + current_end="2024-01-01T03", + target_end="2024-01-01T06", + target_end_source="request", + ), + ) + + monkeypatch.setattr(services, "run_sync", fake_run_sync) + + services.sync_dataset(dataset_id=dataset_id, end="2024-01-01T06", prefer_zarr=False, publish=False) + + assert captured["current_end"] is None + + +def _patch_icechunk_artifact_dependencies( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + captured: dict[str, object], +) -> None: + """Patch all inline imports used by _create_icechunk_artifact.""" + import climate_api.ingest.orchestrator as orchestrator_mod + import climate_api.ingest.store as store_mod + from climate_api.ingestions import services as svc + import xarray as xr + import numpy as np + + def fake_run_ingest_sync(**kwargs: 
object) -> None: + captured.update(kwargs) + + monkeypatch.setattr(orchestrator_mod, "run_ingest_sync", fake_run_ingest_sync) + monkeypatch.setattr(orchestrator_mod, "load_plugin", lambda path, params: object()) + monkeypatch.setattr(store_mod, "open_or_create_repo", lambda _: _FakeRepo()) + monkeypatch.setattr(svc, "coverage_from_open_dataset", lambda ds, **_: { + "has_data": True, + "coverage": { + "temporal": {"start": "2024-01-01T00", "end": "2024-01-01T06"}, + "spatial": {"xmin": 4.0, "ymin": 57.5, "xmax": 31.5, "ymax": 71.5}, + }, + }) + monkeypatch.setattr( + xr, "open_zarr", + lambda *_a, **_k: xr.Dataset({"t2m": xr.DataArray(np.zeros((1,)), dims=["time"])}), + ) + monkeypatch.setattr(svc, "get_extent", lambda: None) + monkeypatch.setattr(svc.downloader, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(svc, "_store_artifact_record", lambda record, **_: record) + + +class _FakeRepo: + def readonly_session(self, _: str) -> "_FakeSession": + return _FakeSession() + + +class _FakeSession: + store = None + + +def test_create_icechunk_artifact_uses_ingest_start_for_delta_efficiency( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + """_create_icechunk_artifact passes ingest_start to run_ingest_sync to skip prior periods.""" + from climate_api.ingestions import services as svc + + captured: dict[str, object] = {} + _patch_icechunk_artifact_dependencies(monkeypatch, tmp_path, captured) + + dataset = { + "id": "era5land_temperature_hourly", + "name": "2m temperature (ERA5-Land)", + "variable": "t2m", + "period_type": "hourly", + "ingestion": {"plugin": "climate_api.ingest.plugins.era5_land.Era5LandPlugin", "params": {"variable": "t2m"}}, + } + + svc._create_icechunk_artifact( + dataset=dataset, + start="2024-01-01T00", + end="2024-01-01T06", + bbox=None, + request_scope=ArtifactRequestScope(start="2024-01-01T00", end="2024-01-01T06", bbox=None), + publish=False, + ingest_start="2024-01-01T04", + ) + + assert captured["start"] == "2024-01-01T04" + + +def 
test_create_icechunk_artifact_uses_full_start_when_no_ingest_start( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + """Without ingest_start, run_ingest_sync receives the artifact's full start.""" + from climate_api.ingestions import services as svc + + captured: dict[str, object] = {} + _patch_icechunk_artifact_dependencies(monkeypatch, tmp_path, captured) + + dataset = { + "id": "era5land_temperature_hourly", + "name": "2m temperature (ERA5-Land)", + "variable": "t2m", + "period_type": "hourly", + "ingestion": {"plugin": "climate_api.ingest.plugins.era5_land.Era5LandPlugin", "params": {"variable": "t2m"}}, + } + + svc._create_icechunk_artifact( + dataset=dataset, + start="2024-01-01T00", + end="2024-01-01T06", + bbox=None, + request_scope=ArtifactRequestScope(start="2024-01-01T00", end="2024-01-01T06", bbox=None), + publish=False, + ) + + assert captured["start"] == "2024-01-01T00" From 438a2fcfb9d955fc1935146d87c05ddbb51c52ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 17:04:14 +0200 Subject: [PATCH 06/80] =?UTF-8?q?feat:=20rechunk=20Icechunk=20store=20in-p?= =?UTF-8?q?lace=20after=20initial=20ingest=20(time:=201=20=E2=86=92=20time?= =?UTF-8?q?:=20N)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds rechunk_store(store_path, time_chunk=N) in store.py. The function opens the latest committed snapshot via a readonly session, rewrites all variables with the new time chunk size via a writable session (using dask for lazy chunk-by-chunk processing), and commits the result as a new Icechunk snapshot. MVCC ensures the previous snapshot is preserved if the rechunk fails. run_ingest / run_ingest_sync gain a rechunk_time: int | None parameter. When set, rechunk_store is called after all periods have been committed. Era5LandPlugin declares rechunk_time = 12 (twelve hourly periods per chunk). 
_create_icechunk_artifact reads this attribute and passes it to run_ingest_sync only on initial ingest (ingest_start is None); sync appends skip rechunking to avoid rewriting the full store on every small update. --- climate_api/ingest/orchestrator.py | 9 +- climate_api/ingest/plugins/era5_land.py | 1 + climate_api/ingest/store.py | 47 ++++++++ climate_api/ingestions/services.py | 9 +- tests/test_ingest_orchestrator.py | 142 ++++++++++++++++++++++++ 5 files changed, 205 insertions(+), 3 deletions(-) diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index c6c97eff..bae713b3 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -22,7 +22,7 @@ import xarray as xr from climate_api.ingest.protocol import GridSpec, IngestionPlugin -from climate_api.ingest.store import open_or_create_repo, read_committed_period_ids +from climate_api.ingest.store import open_or_create_repo, read_committed_period_ids, rechunk_store logger = logging.getLogger(__name__) @@ -58,6 +58,7 @@ async def run_ingest( is_cancel_requested: Callable[[], bool] | None = None, save_cursor: Callable[[dict[str, Any]], None] | None = None, load_cursor: Callable[[], dict[str, Any] | None] | None = None, + rechunk_time: int | None = None, ) -> None: """Probe the source then stream per-period data into an Icechunk store. 
@@ -154,6 +155,10 @@ async def _fetch(period_id: str) -> xr.Dataset: if not spec.time_dim: break + if rechunk_time is not None and spec.time_dim: + logger.info("Rechunking %s after ingest: time chunk → %d", store_path, rechunk_time) + rechunk_store(store_path, time_chunk=rechunk_time) + def run_ingest_sync( *, @@ -168,6 +173,7 @@ def run_ingest_sync( is_cancel_requested: Callable[[], bool] | None = None, save_cursor: Callable[[dict[str, Any]], None] | None = None, load_cursor: Callable[[], dict[str, Any] | None] | None = None, + rechunk_time: int | None = None, ) -> None: """Synchronous wrapper around run_ingest for use in threaded job workers.""" asyncio.run( @@ -183,5 +189,6 @@ def run_ingest_sync( is_cancel_requested=is_cancel_requested, save_cursor=save_cursor, load_cursor=load_cursor, + rechunk_time=rechunk_time, ) ) diff --git a/climate_api/ingest/plugins/era5_land.py b/climate_api/ingest/plugins/era5_land.py index 441b627a..a882a2a7 100644 --- a/climate_api/ingest/plugins/era5_land.py +++ b/climate_api/ingest/plugins/era5_land.py @@ -44,6 +44,7 @@ class Era5LandPlugin: max_concurrency = 4 commit_batch_size = 720 # one month of hourly periods + rechunk_time = 12 # group 12 hourly periods per chunk after initial ingest def __init__(self, variable: str) -> None: self.variable = variable diff --git a/climate_api/ingest/store.py b/climate_api/ingest/store.py index 3210e2e1..564cfde1 100644 --- a/climate_api/ingest/store.py +++ b/climate_api/ingest/store.py @@ -22,6 +22,53 @@ def open_or_create_repo(store_path: Path) -> "icechunk.Repository": return icechunk.Repository.create(storage) +def rechunk_store(store_path: Path, *, time_chunk: int) -> None: + """Rewrite the committed Icechunk store with a coarser time chunk size. + + Opens the latest committed snapshot for reading and a new writable session + for writing, lazily rechunks the time dimension via dask, then commits the + result as a new snapshot. 
Icechunk's MVCC ensures the previous snapshot is + preserved — if the rechunk fails the store rolls back to its original state. + + A no-op when the store does not exist or has no time dimension. + """ + import xarray as xr + + if not store_path.exists(): + return + + repo = open_or_create_repo(store_path) + read_session = repo.readonly_session("main") + ds = xr.open_zarr(read_session.store) + + n_times = ds.sizes.get("time", 0) + if n_times == 0: + return + + effective_chunk = min(time_chunk, n_times) + encoding: dict[str, dict] = {} + for name in list(ds.data_vars) + list(ds.coords): + da = ds[name] + existing = dict(da.encoding) + if "time" in da.dims: + current = existing.get("chunks") + if isinstance(current, (list, tuple)): + new_chunks = list(current) + new_chunks[list(da.dims).index("time")] = effective_chunk + else: + new_chunks = [ + effective_chunk if dim == "time" else da.sizes[dim] + for dim in da.dims + ] + existing["chunks"] = new_chunks + encoding[name] = existing + + write_session = repo.writable_session("main") + ds.chunk({"time": effective_chunk}).to_zarr(write_session.store, mode="w", encoding=encoding) + write_session.commit(f"rechunk: time={effective_chunk}") + logger.info("Rechunked %s: time chunk → %d (%d periods)", store_path, effective_chunk, n_times) + + def read_committed_period_ids(store_path: Path, period_type: str) -> set[str]: """Return the set of period IDs already committed to the Icechunk store. diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 59fe358d..19c42ea7 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -365,9 +365,13 @@ def _create_icechunk_artifact( plugin = load_plugin(plugin_path, params) effective_start = ingest_start if ingest_start is not None else start + # Rechunk after the initial ingest (when no delta start is provided) using the + # plugin's declared rechunk_time, if any. 
Sync appends skip rechunking to avoid + # rewriting the full store on every small update. + rechunk_time: int | None = getattr(plugin, "rechunk_time", None) if ingest_start is None else None logger.info( - "Running Icechunk ingest for '%s': ingest_scope=%s..%s artifact_scope=%s..%s", - dataset_id, effective_start, end, start, end, + "Running Icechunk ingest for '%s': ingest_scope=%s..%s artifact_scope=%s..%s rechunk_time=%s", + dataset_id, effective_start, end, start, end, rechunk_time, ) run_ingest_sync( plugin=plugin, @@ -377,6 +381,7 @@ def _create_icechunk_artifact( end=end, store_path=store_path, period_type=period_type, + rechunk_time=rechunk_time, ) repo = open_or_create_repo(store_path) diff --git a/tests/test_ingest_orchestrator.py b/tests/test_ingest_orchestrator.py index 40f25720..a223b78b 100644 --- a/tests/test_ingest_orchestrator.py +++ b/tests/test_ingest_orchestrator.py @@ -352,3 +352,145 @@ def test_era5land_build_periods_spans_months() -> None: plugin = Era5LandPlugin(variable="t2m") periods = plugin._build_periods("2024-01-31T23", "2024-02-01T01") assert periods == ["2024-01-31T23", "2024-02-01T00", "2024-02-01T01"] + + +# --------------------------------------------------------------------------- +# Rechunking +# --------------------------------------------------------------------------- + +def _time_chunk_size(store_path: Path) -> int: + """Read the time chunk size of the first data variable from the committed store.""" + import icechunk + import zarr + + repo = icechunk.Repository.open(icechunk.local_filesystem_storage(str(store_path))) + session = repo.readonly_session("main") + g = zarr.open_group(session.store, mode="r") + for name in g.array_keys(): + arr = g[name] + dims = list(arr.metadata.dimension_names or []) + if "time" in dims: + return arr.chunks[dims.index("time")] + raise AssertionError("No array with a time dimension found") + + +def test_run_ingest_rechunks_store_after_all_periods(tmp_path: Path) -> None: + """rechunk_time=N 
rewrites the store so the time chunk size is N after ingest.""" + plugin = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-04", + store_path=store_path, + period_type="monthly", + rechunk_time=2, + ) + ) + + assert read_committed_period_ids(store_path, "monthly") == {"2024-01", "2024-02", "2024-03", "2024-04"} + assert _time_chunk_size(store_path) == 2 + + +def test_run_ingest_rechunk_preserves_all_periods_in_store(tmp_path: Path) -> None: + """After rechunking, read_committed_period_ids returns the same set as before.""" + plugin = FakePlugin(["2024-01", "2024-02", "2024-03"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + rechunk_time=3, + ) + ) + + assert read_committed_period_ids(store_path, "monthly") == {"2024-01", "2024-02", "2024-03"} + assert _time_chunk_size(store_path) == 3 + + +def test_run_ingest_no_rechunk_when_rechunk_time_is_none(tmp_path: Path) -> None: + """Without rechunk_time the time chunk stays at 1 (one period per commit).""" + plugin = FakePlugin(["2024-01", "2024-02", "2024-03"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + ) + ) + + assert _time_chunk_size(store_path) == 1 + + +def test_rechunk_store_noop_on_nonexistent_store(tmp_path: Path) -> None: + from climate_api.ingest.store import rechunk_store + + rechunk_store(tmp_path / "nostore.icechunk", time_chunk=12) + + +def test_rechunk_store_changes_chunk_size(tmp_path: Path) -> None: + """rechunk_store can be called directly to rechunk an existing store.""" + from 
climate_api.ingest.store import rechunk_store + + plugin = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-04", + store_path=store_path, + period_type="monthly", + ) + ) + + assert _time_chunk_size(store_path) == 1 + + rechunk_store(store_path, time_chunk=4) + + assert _time_chunk_size(store_path) == 4 + assert read_committed_period_ids(store_path, "monthly") == {"2024-01", "2024-02", "2024-03", "2024-04"} + + +def test_rechunk_store_skips_when_no_time_dimension(tmp_path: Path) -> None: + """rechunk_store is a no-op when the store has no time dimension.""" + import icechunk + import xarray as xr + import numpy as np + from climate_api.ingest.store import rechunk_store + + store_path = tmp_path / "static.icechunk" + storage = icechunk.local_filesystem_storage(str(store_path)) + repo = icechunk.Repository.create(storage) + ds = xr.Dataset({"elevation": xr.DataArray(np.zeros((4, 4), dtype="float32"), dims=["y", "x"])}) + session = repo.writable_session("main") + ds.to_zarr(session.store, mode="w") + session.commit("static write") + + rechunk_store(store_path, time_chunk=12) + + +def test_era5land_plugin_declares_rechunk_time() -> None: + from climate_api.ingest.plugins.era5_land import Era5LandPlugin + + plugin = Era5LandPlugin(variable="t2m") + assert plugin.rechunk_time == 12 From fe1bb5f18bb1ec5591edb40d4aa39fc1cd8a00e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 17:49:52 +0200 Subject: [PATCH 07/80] feat: CHIRPS3 and WorldPop IngestionPlugin implementations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CHIRPS3 fetches daily COG files via HTTP range requests — one period per day, capped to the last complete published month (lag ~1–2 months). 
Four concurrent fetches via thread pool; commits batched at 30 periods (~1/month). WorldPop downloads per-country GeoTIFFs for global2 (2015–2030, 100 m constrained) or global1 (2000–2020). Max concurrency 1 — full-country files can be several hundred MB. Both plugins reuse the probe-estimate pattern (no network call) to derive GridSpec from known pixel resolution. Includes dataset YAML registrations and full unit test coverage (41 tests). --- climate_api/data/datasets/chirps3.yaml | 4 + climate_api/data/datasets/worldpop.yaml | 5 + climate_api/ingest/plugins/chirps3.py | 167 ++++++++++ climate_api/ingest/plugins/worldpop.py | 138 ++++++++ tests/test_ingest_plugins.py | 402 ++++++++++++++++++++++++ 5 files changed, 716 insertions(+) create mode 100644 climate_api/ingest/plugins/chirps3.py create mode 100644 climate_api/ingest/plugins/worldpop.py create mode 100644 tests/test_ingest_plugins.py diff --git a/climate_api/data/datasets/chirps3.yaml b/climate_api/data/datasets/chirps3.yaml index a7a13a8c..652c814f 100644 --- a/climate_api/data/datasets/chirps3.yaml +++ b/climate_api/data/datasets/chirps3.yaml @@ -15,6 +15,10 @@ begin: "1981-01-01" resolution: P1D ingestion: + plugin: climate_api.ingest.plugins.chirps3.Chirps3Plugin + params: + stage: final + flavor: rnl function: dhis2eo.data.chc.chirps3.daily.download units: mm resolution: 5 km x 5 km diff --git a/climate_api/data/datasets/worldpop.yaml b/climate_api/data/datasets/worldpop.yaml index b1d6aa9a..82c4a517 100644 --- a/climate_api/data/datasets/worldpop.yaml +++ b/climate_api/data/datasets/worldpop.yaml @@ -17,6 +17,11 @@ end: "2030" resolution: P1Y ingestion: + plugin: climate_api.ingest.plugins.worldpop.WorldPopPlugin + params: + # country_code is required — set the ISO 3166-1 alpha-3 code for your deployment + # e.g. 
country_code: NOR + version: global2 function: dhis2eo.data.worldpop.pop_total.yearly.download default_params: version: global2 diff --git a/climate_api/ingest/plugins/chirps3.py b/climate_api/ingest/plugins/chirps3.py new file mode 100644 index 00000000..2faf90b4 --- /dev/null +++ b/climate_api/ingest/plugins/chirps3.py @@ -0,0 +1,167 @@ +"""CHIRPS3 IngestionPlugin — daily precipitation from CHC servers. + +Authentication: none required (public COG files on data.chc.ucsb.edu). + +Daily COG files are fetched with HTTP range requests so only the bbox +window is downloaded per period. CHIRPS3 "final/rnl" data is released +in complete months; availability lags roughly one to two months behind +today depending on which day of the month it is. + +URL layout (final): + https://data.chc.ucsb.edu/products/CHIRPS/v3.0/daily/final/{flavor}/cogs/ + {YYYY}/chirps-v3.0.{flavor}.{YYYY}.{MM}.{DD}.cog + +URL layout (prelim): + https://data.chc.ucsb.edu/products/CHIRPS/v3.0/daily/prelim/sat/ + {YYYY}/chirps-v3.0.prelim.{YYYY}.{MM}.{DD}.tif +""" + +from __future__ import annotations + +import asyncio +import calendar +import logging +from concurrent.futures import ThreadPoolExecutor +from datetime import date, timedelta +from typing import Any + +import numpy as np +import xarray as xr + +from climate_api.ingest.protocol import GridSpec + +logger = logging.getLogger(__name__) + +_CHIRPS3_NODATA = -9999.0 +# CHIRPS3 resolution: 0.05° × 0.05° (~5 km at equator) +_CHIRPS3_RES_DEG = 0.05 +# After the 20th of a month, the previous month is considered complete +_COMPLETE_AFTER_DAY = 20 + +_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="chirps3") + + +class Chirps3Plugin: + """IngestionPlugin for CHIRPS v3 daily precipitation. + + Args: + stage: Data maturity stage — 'final' (default, stable) or 'prelim' + (near-real-time, less reliable). Final data lags ~1–2 months. + flavor: File variant within the stage — 'rnl' or 'sat' for final, + 'sat' for prelim. 
Defaults to 'rnl' (final/rnl recommended). + """ + + max_concurrency = 4 + commit_batch_size = 30 + rechunk_time = 30 + + def __init__(self, stage: str = "final", flavor: str = "rnl") -> None: + if stage not in {"final", "prelim"}: + raise ValueError(f"stage must be 'final' or 'prelim', got {stage!r}") + if stage == "final" and flavor not in {"rnl", "sat"}: + raise ValueError(f"For stage='final', flavor must be 'rnl' or 'sat', got {flavor!r}") + if stage == "prelim" and flavor != "sat": + raise ValueError(f"For stage='prelim', flavor must be 'sat', got {flavor!r}") + self.stage = stage + self.flavor = flavor + + # ------------------------------------------------------------------ + # Protocol implementation + # ------------------------------------------------------------------ + + async def probe(self, bbox: list[float], **_: Any) -> GridSpec: + """Estimate grid spec from known CHIRPS3 resolution — no data transfer.""" + return self._probe_estimate(bbox) + + async def periods(self, start: str, end: str) -> list[str]: + return self._build_periods(start, end) + + async def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: + return await asyncio.get_running_loop().run_in_executor( + _executor, self._fetch_sync, period_id, bbox + ) + + # ------------------------------------------------------------------ + # Sync helpers (run inside the thread pool) + # ------------------------------------------------------------------ + + def _url_for_day(self, d: date) -> str: + if self.stage == "final": + return ( + f"https://data.chc.ucsb.edu/products/CHIRPS/v3.0/daily/final/" + f"{self.flavor}/cogs/{d.year}/" + f"chirps-v3.0.{self.flavor}.{d.year}.{d.month:02d}.{d.day:02d}.cog" + ) + return ( + f"https://data.chc.ucsb.edu/products/CHIRPS/v3.0/daily/prelim/sat/" + f"{d.year}/chirps-v3.0.prelim.{d.year}.{d.month:02d}.{d.day:02d}.tif" + ) + + def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: + """Fetch one day via COG range request, 
clip to bbox, return as Dataset.""" + import rioxarray + + d = date.fromisoformat(period_id) + url = self._url_for_day(d) + logger.info("Fetching CHIRPS3 %s: %s", period_id, url) + + da = rioxarray.open_rasterio(url, chunks=None, masked=True, lock=False) + xmin, ymin, xmax, ymax = map(float, bbox) + da = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax) + da = da.squeeze("band", drop=True) + # Guard against files where the mask was not applied via metadata + da = da.where(da != _CHIRPS3_NODATA) + da = da.load() + + ds = da.to_dataset(name="precip") + ds = ds.expand_dims(time=[np.datetime64(period_id, "D")]) + + for name in list(ds.data_vars) + list(ds.coords): + ds[name].encoding.clear() + ds["time"].encoding.update({"units": "days since 1970-01-01", "dtype": "int32"}) + return ds + + def _probe_estimate(self, bbox: list[float]) -> GridSpec: + """Derive GridSpec from CHIRPS3's known 0.05° resolution.""" + import math + + xmin, ymin, xmax, ymax = map(float, bbox) + nx = max(1, math.ceil((xmax - xmin) / _CHIRPS3_RES_DEG)) + ny = max(1, math.ceil((ymax - ymin) / _CHIRPS3_RES_DEG)) + return GridSpec( + shape=(ny, nx), + crs=4326, + dtype=np.dtype("float32"), + nodata=_CHIRPS3_NODATA, + time_dim=True, + ) + + # ------------------------------------------------------------------ + # Period generation + # ------------------------------------------------------------------ + + def _availability_cutoff(self) -> date: + """Return the last day of the most recent complete published month.""" + today = date.today() + months_back = 1 if today.day > _COMPLETE_AFTER_DAY else 2 + y, m = today.year, today.month + for _ in range(months_back): + m -= 1 + if m == 0: + m, y = 12, y - 1 + last_day = calendar.monthrange(y, m)[1] + return date(y, m, last_day) + + def _build_periods(self, start: str, end: str) -> list[str]: + """Return daily ISO-date strings from start to end, clamped to availability.""" + cutoff = self._availability_cutoff() + start_date = 
date.fromisoformat(start[:10]) + end_date = min(date.fromisoformat(end[:10]), cutoff) + if start_date > end_date: + return [] + periods: list[str] = [] + current = start_date + while current <= end_date: + periods.append(current.isoformat()) + current += timedelta(days=1) + return periods diff --git a/climate_api/ingest/plugins/worldpop.py b/climate_api/ingest/plugins/worldpop.py new file mode 100644 index 00000000..3c312823 --- /dev/null +++ b/climate_api/ingest/plugins/worldpop.py @@ -0,0 +1,138 @@ +"""WorldPop IngestionPlugin — yearly population count from WorldPop Global2. + +Authentication: none required (public files on data.worldpop.org). + +Files are per-country GeoTIFFs downloaded in full then clipped to bbox. +Global2 (R2025A) covers 2015–2030 at ~100m resolution (3 arc-seconds). +Global1 covers 2000–2020 at the same resolution (UN-adjusted unconstrained). + +The country_code constructor parameter must match the ISO 3166-1 alpha-3 +code used in WorldPop file names (e.g. 'NOR', 'GHA', 'KEN'). +""" + +from __future__ import annotations + +import asyncio +import io +import logging +import math +from concurrent.futures import ThreadPoolExecutor +from typing import Any + +import numpy as np +import xarray as xr + +from climate_api.ingest.protocol import GridSpec + +logger = logging.getLogger(__name__) + +# WorldPop Global2 at 100m: 3 arc-seconds = 1/1200 degree per pixel +_WORLDPOP_RES_DEG = 1.0 / 1200 + +_executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="worldpop") + + +class WorldPopPlugin: + """IngestionPlugin for WorldPop yearly population count data. + + Args: + country_code: ISO 3166-1 alpha-3 country code (e.g. 'NOR', 'GHA'). + Must match the casing used in WorldPop file paths (stored as + upper-case for directory names, lower-case for filenames). + version: Dataset version — 'global2' (2015–2030, default) or + 'global1' (2000–2020). 
+ """ + + max_concurrency = 1 + commit_batch_size = 1 + + def __init__(self, country_code: str, version: str = "global2") -> None: + self.country_code = country_code.upper() + self.version = version + + # ------------------------------------------------------------------ + # Protocol implementation + # ------------------------------------------------------------------ + + async def probe(self, bbox: list[float], **_: Any) -> GridSpec: + """Estimate grid spec from known WorldPop resolution — no data transfer.""" + return self._probe_estimate(bbox) + + async def periods(self, start: str, end: str) -> list[str]: + return self._build_periods(start, end) + + async def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: + return await asyncio.get_running_loop().run_in_executor( + _executor, self._fetch_sync, int(period_id), bbox + ) + + # ------------------------------------------------------------------ + # Sync helpers (run inside the thread pool) + # ------------------------------------------------------------------ + + def _url_for_year(self, year: int) -> str: + cc = self.country_code + if self.version == "global2": + filename = f"{cc.lower()}_pop_{year}_CN_100m_R2025A_v1.tif" + return ( + f"https://data.worldpop.org/GIS/Population/Global_2015_2030/R2025A/" + f"{year}/{cc}/v1/100m/constrained/{filename}" + ) + if self.version == "global1": + filename = f"{cc.lower()}_ppp_{year}_UNadj.tif" + return ( + f"https://data.worldpop.org/GIS/Population/Global_2000_2020/" + f"{year}/{cc}/{filename}" + ) + raise ValueError(f"Unknown WorldPop version: {self.version!r}") + + def _fetch_sync(self, year: int, bbox: list[float]) -> xr.Dataset: + """Download a per-country GeoTIFF, clip to bbox, return as Dataset.""" + import requests + import rioxarray + + url = self._url_for_year(year) + logger.info("Fetching WorldPop %s %d: %s", self.country_code, year, url) + resp = requests.get(url, timeout=300) + resp.raise_for_status() + + da = 
rioxarray.open_rasterio(io.BytesIO(resp.content)) + xmin, ymin, xmax, ymax = map(float, bbox) + da = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax) + da = da.squeeze("band", drop=True) + da = da.load() + + ds = da.to_dataset(name="pop_total") + ds = ds.expand_dims(time=[np.datetime64(f"{year}-01-01", "D")]) + + for name in list(ds.data_vars) + list(ds.coords): + ds[name].encoding.clear() + ds["time"].encoding.update({"units": "days since 1970-01-01", "dtype": "int32"}) + return ds + + def _probe_estimate(self, bbox: list[float]) -> GridSpec: + """Derive GridSpec from WorldPop's known 3 arc-second resolution.""" + xmin, ymin, xmax, ymax = map(float, bbox) + nx = max(1, math.ceil((xmax - xmin) / _WORLDPOP_RES_DEG)) + ny = max(1, math.ceil((ymax - ymin) / _WORLDPOP_RES_DEG)) + return GridSpec( + shape=(ny, nx), + crs=4326, + dtype=np.dtype("float32"), + nodata=0.0, + time_dim=True, + ) + + # ------------------------------------------------------------------ + # Period generation + # ------------------------------------------------------------------ + + def _build_periods(self, start: str, end: str) -> list[str]: + """Return year strings in [start, end] clamped to version availability.""" + start_year = int(start[:4]) + end_year = int(end[:4]) + valid_range = (2015, 2030) if self.version == "global2" else (2000, 2020) + return [ + str(y) + for y in range(max(start_year, valid_range[0]), min(end_year, valid_range[1]) + 1) + ] diff --git a/tests/test_ingest_plugins.py b/tests/test_ingest_plugins.py new file mode 100644 index 00000000..e622696b --- /dev/null +++ b/tests/test_ingest_plugins.py @@ -0,0 +1,402 @@ +"""Unit tests for WorldPop and CHIRPS3 IngestionPlugins. + +All tests exercise the pure-Python logic (period generation, URL construction, +probe estimation) without making network calls. fetch_period tests use +monkeypatching to replace the network/rioxarray layer with a minimal stub. 
+""" + +from __future__ import annotations + +import asyncio +import io +from datetime import date +from typing import Any +from unittest.mock import MagicMock, patch + +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from climate_api.ingest.protocol import GridSpec, IngestionPlugin + + +# --------------------------------------------------------------------------- +# WorldPopPlugin +# --------------------------------------------------------------------------- + + +class TestWorldPopPlugin: + def _make_plugin(self, country_code: str = "NOR", version: str = "global2") -> Any: + from climate_api.ingest.plugins.worldpop import WorldPopPlugin + + return WorldPopPlugin(country_code=country_code, version=version) + + # Construction + + def test_country_code_uppercased(self) -> None: + plugin = self._make_plugin(country_code="nor") + assert plugin.country_code == "NOR" + + def test_satisfies_protocol(self) -> None: + plugin = self._make_plugin() + assert isinstance(plugin, IngestionPlugin) + + def test_max_concurrency_is_conservative(self) -> None: + plugin = self._make_plugin() + assert plugin.max_concurrency == 1 + + def test_commit_batch_size_is_one(self) -> None: + plugin = self._make_plugin() + assert plugin.commit_batch_size == 1 + + # URL construction + + def test_url_global2_structure(self) -> None: + from climate_api.ingest.plugins.worldpop import WorldPopPlugin + + plugin = WorldPopPlugin(country_code="NOR", version="global2") + url = plugin._url_for_year(2024) + assert "Global_2015_2030" in url + assert "/NOR/" in url + assert "nor_pop_2024" in url + assert url.endswith(".tif") + + def test_url_global1_structure(self) -> None: + from climate_api.ingest.plugins.worldpop import WorldPopPlugin + + plugin = WorldPopPlugin(country_code="GHA", version="global1") + url = plugin._url_for_year(2015) + assert "Global_2000_2020" in url + assert "/GHA/" in url + assert "gha_ppp_2015" in url + assert url.endswith(".tif") + + def 
test_url_unknown_version_raises(self) -> None: + from climate_api.ingest.plugins.worldpop import WorldPopPlugin + + plugin = WorldPopPlugin(country_code="NOR", version="badversion") + with pytest.raises(ValueError, match="Unknown WorldPop version"): + plugin._url_for_year(2020) + + # Period generation + + def test_build_periods_global2_basic(self) -> None: + plugin = self._make_plugin(version="global2") + periods = plugin._build_periods("2018", "2020") + assert periods == ["2018", "2019", "2020"] + + def test_build_periods_single_year(self) -> None: + plugin = self._make_plugin(version="global2") + assert plugin._build_periods("2023", "2023") == ["2023"] + + def test_build_periods_clamps_to_global2_range(self) -> None: + plugin = self._make_plugin(version="global2") + periods = plugin._build_periods("2010", "2035") + assert periods[0] == "2015" + assert periods[-1] == "2030" + + def test_build_periods_clamps_to_global1_range(self) -> None: + plugin = self._make_plugin(version="global1") + periods = plugin._build_periods("1995", "2025") + assert periods[0] == "2000" + assert periods[-1] == "2020" + + def test_build_periods_empty_when_out_of_range(self) -> None: + plugin = self._make_plugin(version="global2") + assert plugin._build_periods("2031", "2035") == [] + + def test_build_periods_uses_year_prefix_only(self) -> None: + # period strings like "2024-01-01" should be handled by stripping to year + plugin = self._make_plugin(version="global2") + periods = plugin._build_periods("2024-01-01", "2025-12-31") + assert periods == ["2024", "2025"] + + # probe / GridSpec + + def test_probe_estimate_returns_gridspec(self) -> None: + plugin = self._make_plugin() + spec = plugin._probe_estimate([4.0, 57.5, 31.5, 71.5]) + assert isinstance(spec, GridSpec) + assert spec.crs == 4326 + assert spec.time_dim is True + assert spec.dtype == np.dtype("float32") + assert spec.nodata == 0.0 + assert spec.shape[0] > 0 and spec.shape[1] > 0 + + def 
test_probe_estimate_shape_proportional_to_bbox(self) -> None: + plugin = self._make_plugin() + small = plugin._probe_estimate([0.0, 0.0, 1.0, 1.0]) + large = plugin._probe_estimate([0.0, 0.0, 10.0, 10.0]) + # 10x wider bbox should yield ~10x more columns + assert large.shape[1] > small.shape[1] * 5 + + def test_probe_is_async_and_returns_gridspec(self) -> None: + plugin = self._make_plugin() + + async def run() -> GridSpec: + return await plugin.probe([4.0, 57.5, 31.5, 71.5]) + + spec = asyncio.run(run()) + assert isinstance(spec, GridSpec) + + # fetch_period (mocked network) + + def _make_fake_da(self, ny: int = 4, nx: int = 5) -> Any: + """Build a minimal DataArray that mimics what rioxarray returns.""" + data = np.ones((1, ny, nx), dtype="float32") + y_coords = np.linspace(71.0, 57.5, ny) + x_coords = np.linspace(4.0, 31.0, nx) + da = xr.DataArray( + data, + dims=["band", "y", "x"], + coords={"band": [1], "y": y_coords, "x": x_coords}, + ) + da = da.rio.set_spatial_dims(x_dim="x", y_dim="y") + da = da.rio.write_crs("EPSG:4326") + return da + + def test_fetch_period_returns_dataset_with_time_and_pop_total(self) -> None: + from climate_api.ingest.plugins.worldpop import WorldPopPlugin + + fake_da = self._make_fake_da() + fake_resp = MagicMock() + fake_resp.raise_for_status = lambda: None + fake_resp.content = b"" + + with patch("requests.get", return_value=fake_resp), patch( + "rioxarray.open_rasterio", return_value=fake_da + ): + ds = WorldPopPlugin(country_code="NOR")._fetch_sync(2024, [4.0, 57.5, 31.5, 71.5]) + + assert "pop_total" in ds.data_vars + assert "time" in ds.dims + assert ds.sizes["time"] == 1 + time_val = pd.Timestamp(ds["time"].values[0]) + assert time_val.year == 2024 + + def test_fetch_period_clears_encoding_except_time(self) -> None: + from climate_api.ingest.plugins.worldpop import WorldPopPlugin + + fake_da = self._make_fake_da() + fake_resp = MagicMock() + fake_resp.raise_for_status = lambda: None + fake_resp.content = b"" + + with 
patch("requests.get", return_value=fake_resp), patch( + "rioxarray.open_rasterio", return_value=fake_da + ): + ds = WorldPopPlugin(country_code="NOR")._fetch_sync(2024, [4.0, 57.5, 31.5, 71.5]) + + assert ds["time"].encoding.get("units") == "days since 1970-01-01" + + +# --------------------------------------------------------------------------- +# Chirps3Plugin +# --------------------------------------------------------------------------- + + +class TestChirps3Plugin: + def _make_plugin(self, stage: str = "final", flavor: str = "rnl") -> Any: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + return Chirps3Plugin(stage=stage, flavor=flavor) + + # Construction + + def test_default_stage_and_flavor(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + plugin = Chirps3Plugin() + assert plugin.stage == "final" + assert plugin.flavor == "rnl" + + def test_satisfies_protocol(self) -> None: + plugin = self._make_plugin() + assert isinstance(plugin, IngestionPlugin) + + def test_max_concurrency(self) -> None: + assert self._make_plugin().max_concurrency == 4 + + def test_commit_batch_size(self) -> None: + assert self._make_plugin().commit_batch_size == 30 + + def test_rechunk_time_declared(self) -> None: + assert self._make_plugin().rechunk_time == 30 + + def test_invalid_stage_raises(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + with pytest.raises(ValueError, match="stage"): + Chirps3Plugin(stage="bad") + + def test_invalid_flavor_for_final_raises(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + with pytest.raises(ValueError, match="flavor"): + Chirps3Plugin(stage="final", flavor="bad") + + def test_invalid_flavor_for_prelim_raises(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + with pytest.raises(ValueError, match="flavor"): + Chirps3Plugin(stage="prelim", flavor="rnl") + + # URL construction + + def 
test_url_final_rnl_structure(self) -> None: + plugin = self._make_plugin(stage="final", flavor="rnl") + url = plugin._url_for_day(date(2024, 3, 15)) + assert "final/rnl/cogs/2024" in url + assert "chirps-v3.0.rnl.2024.03.15.cog" in url + + def test_url_final_sat_structure(self) -> None: + plugin = self._make_plugin(stage="final", flavor="sat") + url = plugin._url_for_day(date(2024, 1, 1)) + assert "final/sat/cogs/2024" in url + assert "chirps-v3.0.sat.2024.01.01.cog" in url + + def test_url_prelim_structure(self) -> None: + plugin = self._make_plugin(stage="prelim", flavor="sat") + url = plugin._url_for_day(date(2024, 11, 5)) + assert "prelim/sat/2024" in url + assert "chirps-v3.0.prelim.2024.11.05.tif" in url + + # Period generation + + def test_build_periods_returns_daily_dates(self) -> None: + plugin = self._make_plugin() + # Use a fixed cutoff by patching today + with patch("climate_api.ingest.plugins.chirps3.date") as mock_date: + mock_date.today.return_value = date(2024, 3, 25) # day > 20 → cutoff = end of Feb + mock_date.fromisoformat = date.fromisoformat + mock_date.side_effect = date + periods = plugin._build_periods("2024-02-01", "2024-03-31") + # Cutoff: end of February 2024 (29 days — 2024 is leap) + assert periods[0] == "2024-02-01" + assert periods[-1] == "2024-02-29" + assert len(periods) == 29 + + def test_build_periods_respects_lag_before_threshold_day(self) -> None: + plugin = self._make_plugin() + with patch("climate_api.ingest.plugins.chirps3.date") as mock_date: + mock_date.today.return_value = date(2024, 3, 10) # day <= 20 → cutoff = end of Jan + mock_date.fromisoformat = date.fromisoformat + mock_date.side_effect = date + periods = plugin._build_periods("2024-01-01", "2024-03-31") + assert periods[-1] == "2024-01-31" + + def test_build_periods_empty_when_start_after_cutoff(self) -> None: + plugin = self._make_plugin() + with patch("climate_api.ingest.plugins.chirps3.date") as mock_date: + mock_date.today.return_value = date(2024, 3, 25) + 
mock_date.fromisoformat = date.fromisoformat + mock_date.side_effect = date + periods = plugin._build_periods("2024-03-01", "2024-03-31") + assert periods == [] + + def test_build_periods_consecutive(self) -> None: + plugin = self._make_plugin() + with patch("climate_api.ingest.plugins.chirps3.date") as mock_date: + mock_date.today.return_value = date(2024, 4, 25) + mock_date.fromisoformat = date.fromisoformat + mock_date.side_effect = date + periods = plugin._build_periods("2024-03-01", "2024-03-05") + assert periods == ["2024-03-01", "2024-03-02", "2024-03-03", "2024-03-04", "2024-03-05"] + + def test_build_periods_single_day(self) -> None: + plugin = self._make_plugin() + with patch("climate_api.ingest.plugins.chirps3.date") as mock_date: + mock_date.today.return_value = date(2024, 4, 25) + mock_date.fromisoformat = date.fromisoformat + mock_date.side_effect = date + periods = plugin._build_periods("2024-03-01", "2024-03-01") + assert periods == ["2024-03-01"] + + def test_build_periods_spans_months(self) -> None: + plugin = self._make_plugin() + with patch("climate_api.ingest.plugins.chirps3.date") as mock_date: + mock_date.today.return_value = date(2024, 5, 25) + mock_date.fromisoformat = date.fromisoformat + mock_date.side_effect = date + periods = plugin._build_periods("2024-03-30", "2024-04-02") + assert periods == ["2024-03-30", "2024-03-31", "2024-04-01", "2024-04-02"] + + # probe / GridSpec + + def test_probe_estimate_returns_gridspec(self) -> None: + plugin = self._make_plugin() + spec = plugin._probe_estimate([-180.0, -50.0, 180.0, 50.0]) + assert isinstance(spec, GridSpec) + assert spec.crs == 4326 + assert spec.time_dim is True + assert spec.dtype == np.dtype("float32") + assert spec.nodata == -9999.0 + assert spec.shape[0] > 0 and spec.shape[1] > 0 + + def test_probe_estimate_shape_matches_chirps3_global_extent(self) -> None: + plugin = self._make_plugin() + # CHIRPS3 full extent: 360° × 100° at 0.05° → 7200 × 2000 + spec = 
plugin._probe_estimate([-180.0, -50.0, 180.0, 50.0]) + assert spec.shape == (2000, 7200) + + def test_probe_is_async_and_returns_gridspec(self) -> None: + plugin = self._make_plugin() + + async def run() -> GridSpec: + return await plugin.probe([-180.0, -50.0, 180.0, 50.0]) + + spec = asyncio.run(run()) + assert isinstance(spec, GridSpec) + + # fetch_period (mocked network) + + def _make_fake_chirps_da(self, ny: int = 4, nx: int = 5) -> Any: + data = np.ones((1, ny, nx), dtype="float32") * 5.0 + y_coords = np.linspace(10.0, 5.0, ny) + x_coords = np.linspace(-5.0, 5.0, nx) + da = xr.DataArray( + data, + dims=["band", "y", "x"], + coords={"band": [1], "y": y_coords, "x": x_coords}, + ) + da = da.rio.set_spatial_dims(x_dim="x", y_dim="y") + da = da.rio.write_crs("EPSG:4326") + return da + + def test_fetch_period_returns_dataset_with_time_and_precip(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + fake_da = self._make_fake_chirps_da() + with patch("rioxarray.open_rasterio", return_value=fake_da): + ds = Chirps3Plugin()._fetch_sync("2024-03-15", [-5.0, 5.0, 5.0, 10.0]) + + assert "precip" in ds.data_vars + assert "time" in ds.dims + assert ds.sizes["time"] == 1 + time_val = pd.Timestamp(ds["time"].values[0]) + assert time_val == pd.Timestamp("2024-03-15") + + def test_fetch_period_masks_nodata_as_nan(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + data = np.array([[[1.0, -9999.0], [3.0, 4.0]]], dtype="float32") + da = xr.DataArray(data, dims=["band", "y", "x"], coords={"band": [1], "y": [2.0, 1.0], "x": [0.0, 1.0]}) + da = da.rio.set_spatial_dims(x_dim="x", y_dim="y") + da = da.rio.write_crs("EPSG:4326") + + with patch("rioxarray.open_rasterio", return_value=da): + ds = Chirps3Plugin()._fetch_sync("2024-01-01", [0.0, 1.0, 1.0, 2.0]) + + precip = ds["precip"].values + assert np.isnan(precip).any(), "nodata pixels should be NaN" + assert not np.isnan(precip).all(), "non-nodata pixels should be finite" + + 
def test_fetch_period_time_encoding_pinned(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + fake_da = self._make_fake_chirps_da() + with patch("rioxarray.open_rasterio", return_value=fake_da): + ds = Chirps3Plugin()._fetch_sync("2024-03-15", [-5.0, 5.0, 5.0, 10.0]) + + assert ds["time"].encoding.get("units") == "days since 1970-01-01" From a841eb701f777884e7f821f6dae2b2080c36274e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 17:50:07 +0200 Subject: [PATCH 08/80] feat: Icechunk support in data accessor, zarr-serving, STAC and resample MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit open_icechunk_dataset opens an Icechunk store via a readonly MVCC session and is wired into every place open_zarr_dataset is used: the publications provider axes, STAC collection builder, and resample source loader. The zarr-serving HTTP routes handle ArtifactFormat.ICECHUNK by opening the store via repo.readonly_session("main") and serving all zarr keys through session.store[key] — Icechunk's virtual zarr v3 interface. Directory-style paths enumerate group members via zarr.open_group. Metadata keys are served as JSON; chunk keys as raw bytes. STAC link generation and resample source validation both recognise ICECHUNK alongside ZARR. pygeoapi registration is skipped for Icechunk artifacts (pygeoapi reads plain zarr directories, not Icechunk repos). 
--- .../data_accessor/services/accessor.py | 10 ++ climate_api/ingestions/services.py | 105 ++++++++++++++++- climate_api/processing/resample.py | 12 +- climate_api/publications/services.py | 13 ++- climate_api/stac/services.py | 10 +- tests/test_datasets.py | 106 ++++++++++++++++++ tests/test_processing_resample.py | 85 ++++++++++++++ tests/test_stac.py | 58 ++++++++++ 8 files changed, 387 insertions(+), 12 deletions(-) diff --git a/climate_api/data_accessor/services/accessor.py b/climate_api/data_accessor/services/accessor.py index b770791d..e2ac9786 100644 --- a/climate_api/data_accessor/services/accessor.py +++ b/climate_api/data_accessor/services/accessor.py @@ -3,6 +3,7 @@ import logging import os import tempfile +from pathlib import Path from typing import Any import numpy as np @@ -124,6 +125,15 @@ def open_zarr_dataset(zarr_path: str) -> xr.Dataset: return ds +def open_icechunk_dataset(store_path: str | Path) -> xr.Dataset: + """Open an Icechunk store as an xarray Dataset via a readonly MVCC session.""" + from climate_api.ingest.store import open_or_create_repo + + repo = open_or_create_repo(Path(store_path)) + session = repo.readonly_session("main") + return xr.open_zarr(session.store) + + def _open_zarr(zarr_path: str) -> xr.Dataset: """Open a zarr store with automatic consolidated metadata detection.""" return xr.open_zarr(zarr_path, consolidated=None) # type: ignore[no-any-return] diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 19c42ea7..0522a405 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -582,6 +582,10 @@ def plan_sync_dataset( def get_dataset_zarr_store_info_or_404(dataset_id: str) -> dict[str, object]: """Return a public Zarr store listing for a managed dataset.""" artifact = get_latest_artifact_for_dataset_or_404(dataset_id) + + if artifact.format == ArtifactFormat.ICECHUNK: + return _icechunk_store_info(dataset_id, artifact) + store_root = 
_get_zarr_root_or_409(artifact) entries = _zarr_entries(dataset_id=dataset_id, store_root=store_root, directory=store_root) @@ -600,6 +604,51 @@ def get_dataset_zarr_store_info_or_404(dataset_id: str) -> dict[str, object]: } +def _icechunk_store_info(dataset_id: str, artifact: ArtifactRecord) -> dict[str, object]: + """Return a Zarr store listing for an Icechunk-backed artifact.""" + import zarr + + from climate_api.ingest.store import open_or_create_repo + + store_path = Path(artifact.path or artifact.asset_paths[0]) + if not store_path.exists(): + raise HTTPException(status_code=404, detail="Icechunk store not found on disk") + + repo = open_or_create_repo(store_path) + session = repo.readonly_session("main") + + store_attrs: dict[str, object] = {} + try: + root_meta = json.loads(bytes(session.store["zarr.json"])) # type: ignore[index] + store_attrs = root_meta.get("attributes", {}) # type: ignore[assignment] + except Exception: + pass + + store_crs = store_attrs.get("proj:code") + crs = store_crs if isinstance(store_crs, str) and store_crs else api_config.get_crs() + + root: zarr.Group = zarr.open_group(session.store, mode="r") # type: ignore[assignment] + entries = [ + { + "name": name, + "kind": "directory", + "href": f"/zarr/{dataset_id}/{name}", + } + for name in sorted(root.keys()) + ] + + return { + "kind": "ZarrListing", + "dataset_id": dataset_id, + "format": artifact.format, + "path": ".", + "crs": crs, + "proj4": _crs_to_proj4(crs), + "bounds": _read_zarr_bounds(store_attrs), + "entries": entries, + } + + def _crs_to_proj4(crs: str) -> str | None: """Convert an EPSG code or WKT string to a proj4 definition string, or None on failure.""" import warnings @@ -651,6 +700,10 @@ def get_dataset_zarr_store_file_or_404( ) -> FileResponse | Response | dict[str, object]: """Serve a file, metadata document, or directory listing within a dataset Zarr store.""" artifact = get_latest_artifact_for_dataset_or_404(dataset_id) + + if artifact.format == 
ArtifactFormat.ICECHUNK: + return _serve_icechunk_key(dataset_id, artifact, relative_path) + store_root = _get_zarr_root_or_409(artifact) target = _resolve_zarr_path(store_root, relative_path) if not target.exists(): @@ -666,6 +719,55 @@ def get_dataset_zarr_store_file_or_404( return FileResponse(target, media_type=media_type, filename=target.name) +def _serve_icechunk_key( + dataset_id: str, artifact: ArtifactRecord, relative_path: str +) -> Response | dict[str, object]: + """Serve a zarr v3 key from an Icechunk store via its session store.""" + import zarr + + from climate_api.ingest.store import open_or_create_repo + + store_path = Path(artifact.path or artifact.asset_paths[0]) + if not store_path.exists(): + raise HTTPException(status_code=404, detail="Icechunk store not found on disk") + + repo = open_or_create_repo(store_path) + session = repo.readonly_session("main") + key = relative_path.lstrip("/") + + # Directory-like paths: list child keys as a ZarrListing + if not key or key.endswith("/"): + root: zarr.Group = zarr.open_group(session.store, mode="r") # type: ignore[assignment] + prefix = key.rstrip("/") + try: + node: zarr.Group = root[prefix] if prefix else root # type: ignore[assignment] + except KeyError: + raise HTTPException(status_code=404, detail=f"Zarr path '{relative_path}' not found") + entries = [ + { + "name": name, + "kind": "directory", + "href": f"/zarr/{dataset_id}/{prefix}/{name}".replace("//", "/"), + } + for name in sorted(node.keys()) + ] + return { + "kind": "ZarrListing", + "dataset_id": dataset_id, + "path": key or ".", + "entries": entries, + } + + try: + data: bytes = bytes(session.store[key]) # type: ignore[index] + except KeyError: + raise HTTPException(status_code=404, detail=f"Zarr key '{relative_path}' not found in store") + + if key.endswith("zarr.json"): + return JSONResponse(content=json.loads(data)) + return Response(content=data, media_type="application/octet-stream") + + def _load_records() -> list[ArtifactRecord]: 
ensure_store() raw = json.loads(ARTIFACTS_INDEX_PATH.read_text(encoding="utf-8")) @@ -1010,7 +1112,8 @@ def _dataset_links(dataset_id: str, latest: ArtifactRecord) -> list[DatasetAcces DatasetAccessLink(href=f"/datasets/{dataset_id}", rel="self", title="Dataset detail"), DatasetAccessLink(href=f"/zarr/{dataset_id}", rel="zarr", title="Zarr store"), ] - if latest.publication.status == PublicationStatus.PUBLISHED and latest.format == ArtifactFormat.ZARR: + zarr_formats = {ArtifactFormat.ZARR, ArtifactFormat.ICECHUNK} + if latest.publication.status == PublicationStatus.PUBLISHED and latest.format in zarr_formats: links.append(DatasetAccessLink(href=f"/stac/collections/{dataset_id}", rel="stac", title="STAC collection")) if latest.format == ArtifactFormat.NETCDF: links.append( diff --git a/climate_api/processing/resample.py b/climate_api/processing/resample.py index 62970b1e..ed3bc814 100644 --- a/climate_api/processing/resample.py +++ b/climate_api/processing/resample.py @@ -15,7 +15,7 @@ import xarray as xr from fastapi import HTTPException -from climate_api.data_accessor.services.accessor import open_zarr_dataset +from climate_api.data_accessor.services.accessor import open_icechunk_dataset, open_zarr_dataset from climate_api.data_manager.services.utils import get_time_dim from climate_api.data_registry.services import datasets as registry_datasets from climate_api.ingestions import services as ingestion_services @@ -88,13 +88,17 @@ def materialize_resampled_artifact( raise HTTPException(status_code=404, detail=f"Source dataset template '{source_dataset_id}' not found") source_artifact = _resolve_source_artifact(source_dataset_id=source_dataset_id) - if source_artifact.format != ArtifactFormat.ZARR: - raise HTTPException(status_code=409, detail="Resampling currently requires a Zarr-backed source artifact") + if source_artifact.format not in {ArtifactFormat.ZARR, ArtifactFormat.ICECHUNK}: + raise HTTPException(status_code=409, detail="Resampling currently requires a 
Zarr or Icechunk source artifact") target_managed_dataset_id = managed_dataset_id_for_scope(target_dataset_id) zarr_path = DERIVED_DATA_DIR / f"{target_managed_dataset_id}.zarr" - source_ds = open_zarr_dataset(source_artifact.path or source_artifact.asset_paths[0]) + source_path = source_artifact.path or source_artifact.asset_paths[0] + if source_artifact.format == ArtifactFormat.ICECHUNK: + source_ds = open_icechunk_dataset(source_path) + else: + source_ds = open_zarr_dataset(source_path) try: resampled = _resample_dataset( source_ds=source_ds, diff --git a/climate_api/publications/services.py b/climate_api/publications/services.py index 6451e177..34b32639 100644 --- a/climate_api/publications/services.py +++ b/climate_api/publications/services.py @@ -12,7 +12,7 @@ import xarray as xr import yaml -from climate_api.data_accessor.services.accessor import open_zarr_dataset +from climate_api.data_accessor.services.accessor import open_icechunk_dataset, open_zarr_dataset from climate_api.data_manager.services.utils import get_time_dim, get_x_y_dims from climate_api.ingestions.schemas import ArtifactFormat, ArtifactRecord, PublicationStatus @@ -49,6 +49,7 @@ def publish_artifact(record: ArtifactRecord) -> ArtifactRecord: collection_id = managed_dataset_id_for(record) data_path = record.path or record.asset_paths[0] is_pyramid_zarr = record.format == ArtifactFormat.ZARR and (Path(data_path) / "0").is_dir() + is_icechunk = record.format == ArtifactFormat.ICECHUNK published_record = record.model_copy( update={ "publication": record.publication.model_copy( @@ -56,8 +57,8 @@ def publish_artifact(record: ArtifactRecord) -> ArtifactRecord: "status": PublicationStatus.PUBLISHED, "collection_id": collection_id, "published_at": datetime.now(UTC), - # Pyramid zarr stores are served via the /zarr endpoint, not pygeoapi. 
- "pygeoapi_path": None if is_pyramid_zarr else f"/ogcapi/collections/{collection_id}", + # Pyramid zarr and Icechunk stores are served via the /zarr endpoint, not pygeoapi. + "pygeoapi_path": None if (is_pyramid_zarr or is_icechunk) else f"/ogcapi/collections/{collection_id}", } ) } @@ -69,6 +70,8 @@ def publish_artifact(record: ArtifactRecord) -> ArtifactRecord: if active.publication.status != PublicationStatus.PUBLISHED: continue data_path = active.path or active.asset_paths[0] + if active.format == ArtifactFormat.ICECHUNK: + continue # icechunk: not served via pygeoapi, use /zarr endpoint instead if active.format == ArtifactFormat.ZARR and (Path(data_path) / "0").is_dir(): continue # pyramid zarr: not served via pygeoapi, use /zarr endpoint instead assert active.publication.collection_id is not None @@ -160,7 +163,9 @@ def _provider_format(artifact_format: ArtifactFormat) -> dict[str, str]: def _provider_axes(record: ArtifactRecord) -> tuple[str, str, str]: """Inspect an artifact and return provider axis field names.""" data_path = record.path or record.asset_paths[0] - if record.format == ArtifactFormat.ZARR: + if record.format == ArtifactFormat.ICECHUNK: + ds = open_icechunk_dataset(data_path) + elif record.format == ArtifactFormat.ZARR: ds = open_zarr_dataset(data_path) else: ds = xr.open_dataset(data_path) diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index c8801196..022ed8d9 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -13,7 +13,7 @@ from fastapi import HTTPException, Request from xstac import xarray_to_stac -from climate_api.data_accessor.services.accessor import open_zarr_dataset +from climate_api.data_accessor.services.accessor import open_icechunk_dataset, open_zarr_dataset from climate_api.data_manager.services.utils import get_time_dim, get_x_y_dims from climate_api.data_registry.services import datasets as registry_datasets from climate_api.ingestions import services as ingestion_services 
@@ -131,7 +131,7 @@ def _eligible_artifacts_by_dataset() -> dict[str, ArtifactRecord]: latest = max(artifacts, key=lambda artifact: artifact.created_at) if latest.publication.status != PublicationStatus.PUBLISHED: continue - if latest.format != ArtifactFormat.ZARR: + if latest.format not in (ArtifactFormat.ZARR, ArtifactFormat.ICECHUNK): continue result[dataset_id] = latest return dict(sorted(result.items())) @@ -196,7 +196,11 @@ def _build_collection_with_xstac(*, artifact: ArtifactRecord, template: pystac.C return deepcopy(cached_payload) try: - ds = open_zarr_dataset(_artifact_store_path(artifact)) + store_path = _artifact_store_path(artifact) + if artifact.format == ArtifactFormat.ICECHUNK: + ds = open_icechunk_dataset(store_path) + else: + ds = open_zarr_dataset(store_path) except HTTPException: raise except Exception as exc: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 7ece9ee2..a68e55cd 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -125,6 +125,15 @@ def test_dataset_links_include_stac_for_published_zarr() -> None: assert any(link.rel == "stac" and link.href == "/stac/collections/chirps3_precipitation_daily" for link in links) +def test_dataset_links_include_stac_for_published_icechunk() -> None: + artifact = _artifact(artifact_id="a1") + artifact = artifact.model_copy(update={"format": ArtifactFormat.ICECHUNK}) + + links = services._dataset_links("chirps3_precipitation_daily", artifact) + + assert any(link.rel == "stac" and link.href == "/stac/collections/chirps3_precipitation_daily" for link in links) + + def test_dataset_links_omit_stac_for_unpublished_or_netcdf() -> None: unpublished = _artifact(artifact_id="a1") unpublished.publication.status = PublicationStatus.UNPUBLISHED @@ -835,3 +844,100 @@ def test_create_artifact_delta_rejects_short_rebuilt_coverage( assert exc_info.value.status_code == 409 assert "coverage=2026-02-01..2026-02-10" in str(exc_info.value.detail) assert "request=2026-01-01..2026-02-10" in 
str(exc_info.value.detail) + + +def _icechunk_artifact( + *, + artifact_id: str = "ic1", + path: str = "/tmp/test.icechunk", +) -> ArtifactRecord: + return ArtifactRecord( + artifact_id=artifact_id, + dataset_id="chirps3_precipitation_daily", + dataset_name="CHIRPS3 precipitation", + variable="precip", + format=ArtifactFormat.ICECHUNK, + path=path, + asset_paths=[path], + variables=["precip"], + request_scope=ArtifactRequestScope( + start="2026-01-01", + end="2026-01-10", + bbox=(1.0, 2.0, 3.0, 4.0), + ), + coverage=ArtifactCoverage( + temporal=CoverageTemporal(start="2026-01-01", end="2026-01-10"), + spatial=CoverageSpatial(xmin=1.0, ymin=2.0, xmax=3.0, ymax=4.0), + ), + created_at=datetime(2026, 1, 10, tzinfo=UTC), + publication=ArtifactPublication(status=PublicationStatus.PUBLISHED), + ) + + +def test_get_zarr_store_info_dispatches_to_icechunk_for_icechunk_artifact( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + store_path = tmp_path / "test.icechunk" + store_path.mkdir() + artifact = _icechunk_artifact(path=str(store_path)) + + monkeypatch.setattr( + services, + "get_latest_artifact_for_dataset_or_404", + lambda _: artifact, + ) + + called_with: list[str] = [] + + def fake_icechunk_store_info(dataset_id: str, art: ArtifactRecord) -> dict: + called_with.append(dataset_id) + return {"kind": "ZarrListing", "dataset_id": dataset_id, "format": art.format, "entries": []} + + monkeypatch.setattr(services, "_icechunk_store_info", fake_icechunk_store_info) + + result = services.get_dataset_zarr_store_info_or_404("chirps3_precipitation_daily") + + assert result["kind"] == "ZarrListing" + assert called_with == ["chirps3_precipitation_daily"] + + +def test_get_zarr_store_file_dispatches_to_icechunk_for_icechunk_artifact( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + store_path = tmp_path / "test.icechunk" + store_path.mkdir() + artifact = _icechunk_artifact(path=str(store_path)) + + monkeypatch.setattr( + services, + 
"get_latest_artifact_for_dataset_or_404", + lambda _: artifact, + ) + + served_keys: list[str] = [] + + from starlette.responses import Response + + def fake_serve_icechunk_key(dataset_id: str, art: ArtifactRecord, relative_path: str) -> Response: + served_keys.append(relative_path) + return Response(content=b'{"zarr_format": 3}', media_type="application/json") + + monkeypatch.setattr(services, "_serve_icechunk_key", fake_serve_icechunk_key) + + services.get_dataset_zarr_store_file_or_404("chirps3_precipitation_daily", "t2m/zarr.json") + + assert served_keys == ["t2m/zarr.json"] + + +def test_get_zarr_store_info_raises_409_for_netcdf_artifact(monkeypatch: pytest.MonkeyPatch) -> None: + netcdf = _artifact(artifact_id="a1") + netcdf = netcdf.model_copy(update={"format": ArtifactFormat.NETCDF}) + + monkeypatch.setattr(services, "get_latest_artifact_for_dataset_or_404", lambda _: netcdf) + + with pytest.raises(services.HTTPException) as exc_info: + services.get_dataset_zarr_store_info_or_404("chirps3_precipitation_daily") + + assert exc_info.value.status_code == 409 diff --git a/tests/test_processing_resample.py b/tests/test_processing_resample.py index 1aa1c912..5374fe2a 100644 --- a/tests/test_processing_resample.py +++ b/tests/test_processing_resample.py @@ -184,6 +184,91 @@ def test_materialize_resampled_artifact_returns_404_when_source_dataset_template ) +def test_materialize_resampled_artifact_returns_409_when_source_is_netcdf( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + source_artifact = _artifact( + artifact_id="source-netcdf", + dataset_id="era5land_temperature_hourly", + managed_dataset_id="era5land_temperature_hourly_sle", + path=tmp_path / "source.nc", + start="2026-01-01", + end="2026-01-02", + ) + source_artifact = source_artifact.model_copy(update={"format": ArtifactFormat.NETCDF}) + + monkeypatch.setattr( + resample.registry_datasets, + "get_dataset", + lambda dataset_id: {"id": dataset_id, "period_type": "daily"}, + ) + 
monkeypatch.setattr( + resample.ingestion_services, + "get_latest_artifact_for_dataset_or_404", + lambda _: source_artifact, + ) + + with pytest.raises(resample.HTTPException, match="Zarr or Icechunk"): + resample.materialize_resampled_artifact( + source_dataset_id="era5land_temperature_hourly", + frequency="1D", + method="mean", + start="2026-01-01", + end="2026-01-02", + overwrite=False, + publish=False, + ) + + +def test_materialize_resampled_artifact_reads_icechunk_source( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + source_path = tmp_path / "source.icechunk" + source_artifact = _artifact( + artifact_id="source-icechunk", + dataset_id="era5land_temperature_hourly", + managed_dataset_id="era5land_temperature_hourly_sle", + path=source_path, + start="2026-01-01T00", + end="2026-01-02T23", + ) + source_artifact = source_artifact.model_copy(update={"format": ArtifactFormat.ICECHUNK}) + + time = np.array("2026-01-01T00", dtype="datetime64[h]") + np.arange(48) + ds = xr.Dataset( + {"value": (("time", "lat", "lon"), np.arange(48, dtype=float).reshape(48, 1, 1))}, + coords={"time": time, "lat": [2.0], "lon": [1.0]}, + ) + + monkeypatch.setattr( + resample.registry_datasets, + "get_dataset", + lambda dataset_id: {"id": dataset_id, "period_type": "hourly"} if "hourly" in dataset_id else None, + ) + monkeypatch.setattr( + resample.ingestion_services, + "get_latest_artifact_for_dataset_or_404", + lambda _: source_artifact, + ) + monkeypatch.setattr(resample, "open_icechunk_dataset", lambda _: ds) + + artifact = resample.materialize_resampled_artifact( + source_dataset_id="era5land_temperature_hourly", + frequency="1D", + method="mean", + start="2026-01-01", + end="2026-01-02", + overwrite=False, + publish=False, + ) + + assert artifact.dataset_id == "era5land_temperature_hourly_1d_mean" + assert artifact.coverage.temporal.start == "2026-01-01" + assert artifact.coverage.temporal.end == "2026-01-02" + + def 
test_materialize_resampled_artifact_drops_incomplete_trailing_week( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, diff --git a/tests/test_stac.py b/tests/test_stac.py index 057170f3..9084b136 100644 --- a/tests/test_stac.py +++ b/tests/test_stac.py @@ -119,6 +119,64 @@ def test_catalog_excludes_unpublished_and_netcdf(client: TestClient, monkeypatch assert [link for link in payload["links"] if link["rel"] == "child"] == [] +def test_catalog_includes_published_icechunk_artifact(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + ingestion_services, + "list_artifacts", + lambda: SimpleNamespace(items=[_artifact(artifact_id="a1", format=ArtifactFormat.ICECHUNK)]), + ) + monkeypatch.setattr(stac_services.registry_datasets, "get_dataset", lambda _: {"period_type": "daily"}) + monkeypatch.setattr(stac_services, "open_icechunk_dataset", lambda _: SimpleNamespace(close=lambda: None)) + monkeypatch.setattr(stac_services, "get_x_y_dims", lambda _: ("x", "y")) + monkeypatch.setattr(stac_services, "get_time_dim", lambda _: "time") + monkeypatch.setattr(stac_services, "xarray_to_stac", lambda ds, template, **kw: template) + + response = client.get("/stac/catalog.json") + + assert response.status_code == 200 + child_links = [link for link in response.json()["links"] if link["rel"] == "child"] + assert len(child_links) == 1 + assert "chirps3_precipitation_daily" in child_links[0]["href"] + + +def test_collection_uses_icechunk_dataset_for_icechunk_artifact( + monkeypatch: pytest.MonkeyPatch, +) -> None: + artifact = _artifact(artifact_id="a1", format=ArtifactFormat.ICECHUNK) + + class DummyDataset: + def close(self) -> None: + pass + + opened: list[str] = [] + + def fake_open_icechunk(path: str) -> DummyDataset: + opened.append(path) + return DummyDataset() + + template = pystac.Collection( + id="chirps3_precipitation_daily", + description="template", + extent=pystac.Extent( + spatial=pystac.SpatialExtent([[1.0, 2.0, 3.0, 4.0]]), + 
temporal=pystac.TemporalExtent([[datetime(2026, 1, 1, tzinfo=UTC), datetime(2026, 1, 10, tzinfo=UTC)]]), + ), + title="CHIRPS3 precipitation", + license="proprietary", + ) + template.add_asset("zarr", pystac.Asset(href="http://example.test/zarr")) + monkeypatch.setattr(stac_services, "open_icechunk_dataset", fake_open_icechunk) + monkeypatch.setattr(stac_services, "get_x_y_dims", lambda _: ("x", "y")) + monkeypatch.setattr(stac_services, "get_time_dim", lambda _: "time") + monkeypatch.setattr(stac_services, "xarray_to_stac", lambda *args, **kwargs: template) + + payload = stac_services._build_collection_with_xstac(artifact=artifact, template=template) + + assert payload["type"] == "Collection" + assert len(opened) == 1 + assert opened[0] == "/tmp/chirps3_precipitation_daily.zarr" + + def test_collection_uses_xstac_and_adds_expected_fields(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr( ingestion_services, From bd713c7be596a3b589783b20dabb706aa8c81ef6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 17:50:19 +0200 Subject: [PATCH 09/80] feat: GridSpec.extra_dims and expire_snapshots after ingest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GridSpec gains an extra_dims field for plugins that produce multidimensional stores (e.g. WorldPop age/sex with dims {age_group: 20, sex: 2}). The orchestrator does not act on extra_dims yet; the field exposes the intent to protocol users and prepares for future chunk-shape computation. The orchestrator calls repo.expire_snapshots(older_than=now) after the ingest loop (and rechunk pass) completes. Each period write creates one Icechunk snapshot; for a 35-year daily ingest this produces ~12,775 intermediate snapshots. expire_snapshots marks them as expired without touching chunk data — the "main" branch ref preserves HEAD. Run garbage_collect separately to reclaim manifest storage. 
The repo is reopened after rechunk_store() so expire_snapshots operates on the post-rechunk HEAD rather than the stale pre-rechunk repo handle. --- climate_api/ingest/orchestrator.py | 15 +++++++++++++++ climate_api/ingest/protocol.py | 6 ++++++ 2 files changed, 21 insertions(+) diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index bae713b3..7b4ec08d 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -16,6 +16,7 @@ import importlib import logging from collections.abc import Callable +from datetime import datetime, timezone from pathlib import Path from typing import Any @@ -158,6 +159,20 @@ async def _fetch(period_id: str) -> xr.Dataset: if rechunk_time is not None and spec.time_dim: logger.info("Rechunking %s after ingest: time chunk → %d", store_path, rechunk_time) rechunk_store(store_path, time_chunk=rechunk_time) + # Reopen repo so expire_snapshots sees the post-rechunk HEAD. + repo = open_or_create_repo(store_path) + + # Prune intermediate ingest snapshots: each period commit created one + # snapshot; only the final state (HEAD of "main") needs to be retained. + # expire_snapshots marks older snapshots as expired without deleting chunk + # data — garbage_collect would be needed to reclaim manifest storage. + # The "main" branch ref preserves HEAD even when it appears in the expired set. + try: + expired = repo.expire_snapshots(older_than=datetime.now(tz=timezone.utc)) + if expired: + logger.info("Expired %d intermediate snapshots from %s", len(expired), store_path) + except Exception: + logger.warning("expire_snapshots failed for %s — store remains valid", store_path, exc_info=True) def run_ingest_sync( diff --git a/climate_api/ingest/protocol.py b/climate_api/ingest/protocol.py index 42ecdee2..8f5e109b 100644 --- a/climate_api/ingest/protocol.py +++ b/climate_api/ingest/protocol.py @@ -19,6 +19,11 @@ class GridSpec: attributes before the first period is written. 
Set time_dim=False for static (time-invariant) datasets — the orchestrator branches on this flag and issues a single write with no append dimension. + + extra_dims: optional non-spatial, non-time dimensions in the store, e.g. + {"age_group": 20, "sex": 2}. The orchestrator does not use this field; + it exists for plugin authors who need to document multidimensional + stores and for future orchestrator extensions. """ shape: tuple[int, int] @@ -29,6 +34,7 @@ class GridSpec: x_dim: str = "x" y_dim: str = "y" attrs: dict[str, Any] = field(default_factory=dict) + extra_dims: dict[str, int] = field(default_factory=dict) @runtime_checkable From 3cdba2cd9f6b56f5f73489d1a80f9c6d6e650c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 17:59:41 +0200 Subject: [PATCH 10/80] docs: update for IngestionPlugin, Icechunk ingest, and built-in plugins extensibility.md: add IngestionPlugin as a first-class extension point with full protocol reference, GridSpec field table, and a comparison table for choosing between function and plugin approaches. adding_custom_datasets.md: update overview to mention both ingestion paths; update ingestion field reference table with ingestion.plugin and ingestion.params; add IngestionPlugin skeleton and dataset template example at end of doc. built_in_datasets.md: replace stale "Sync behaviour" descriptions with accurate Ingest method + Sync behaviour paragraphs for CHIRPS3 (COG range requests), ERA5-Land (remote zarr, monthly fetch), and WorldPop (per-country GeoTIFF download). Add country_code note to WorldPop. architecture.md: replace single-path data lifecycle diagram with dual-path diagram (function path vs plugin path); add IngestionPlugin to the plugin contract section; update the append execution mode section to distinguish cache-based (function path) from Icechunk-based (plugin path) behaviour. 
--- docs/adding_custom_datasets.md | 103 ++++++++++++++++++++++++++++++++- docs/architecture.md | 87 +++++++++++++++++++--------- docs/built_in_datasets.md | 16 +++-- docs/extensibility.md | 76 ++++++++++++++++++++++++ 4 files changed, 248 insertions(+), 34 deletions(-) diff --git a/docs/adding_custom_datasets.md b/docs/adding_custom_datasets.md index 0342fade..7e27dc99 100644 --- a/docs/adding_custom_datasets.md +++ b/docs/adding_custom_datasets.md @@ -8,8 +8,10 @@ The built-in dataset templates (CHIRPS3, ERA5-Land, WorldPop) ship as package da Adding a custom dataset involves two things: -1. **A download function** — a Python function that downloads data and writes it as one or more NetCDF files to a given directory. -2. **A dataset template YAML** — a file that describes the dataset and tells the API which download function to call. +1. **An ingestion function or plugin** — either a download function that writes NetCDF files to disk, or an `IngestionPlugin` class that streams data directly into an Icechunk store. +2. **A dataset template YAML** — a file that describes the dataset and tells the API which function or plugin to call. + +Use the **download function** approach for simple sources. Use the **IngestionPlugin** approach for sources that benefit from streaming (COG range requests, remote zarr, resumable long ingests with per-period commits). Both can coexist in the same template during migration. 
## Step 1: Write the download function @@ -127,9 +129,13 @@ Omit `sync.availability` entirely for `static` datasets or when you always want | Field | Required | Description | | ----- | -------- | ----------- | -| `ingestion.function` | Yes | Dotted path to the download function | +| `ingestion.plugin` | One of `plugin` or `function` | Dotted path to an `IngestionPlugin` class — preferred for streaming sources | +| `ingestion.params` | No | Constructor keyword arguments forwarded to the plugin class | +| `ingestion.function` | One of `plugin` or `function` | Dotted path to the download function — for simpler file-based sources | | `ingestion.default_params` | No | Extra keyword arguments forwarded to the download function | +Both keys can coexist in the same template. When `ingestion.plugin` is present it is used; `ingestion.function` serves as a fallback for legacy tooling. + **Transforms** — applied after download, before writing to Zarr: ```yaml @@ -225,3 +231,94 @@ The smallest valid template for a static dataset with no sync: ingestion: function: mypackage.sources.my_source.download ``` + +--- + +## Ingestion plugin + +For sources that need streaming access or resumable long ingests, implement an `IngestionPlugin` instead of a download function. The plugin streams data directly into the Icechunk store one period at a time — no intermediate files, no full-rebuild on sync. 
+ +### Plugin skeleton + +```python +# mypackage/sources/my_plugin.py +from __future__ import annotations + +import asyncio +from concurrent.futures import ThreadPoolExecutor +from typing import Any + +import numpy as np +import xarray as xr + +from climate_api.ingest.protocol import GridSpec + +_executor = ThreadPoolExecutor(max_workers=2) + + +class MyPlugin: + max_concurrency = 2 # fetch this many periods in parallel + commit_batch_size = 1 # commit every N periods + + def __init__(self, variable: str) -> None: + self.variable = variable + + async def probe(self, bbox: list[float], **_: Any) -> GridSpec: + """Return grid shape and CRS without downloading data.""" + # Derive shape from known resolution, or open a small metadata request. + xmin, ymin, xmax, ymax = bbox + res = 0.05 # degrees per pixel + import math + nx = max(1, math.ceil((xmax - xmin) / res)) + ny = max(1, math.ceil((ymax - ymin) / res)) + return GridSpec(shape=(ny, nx), crs=4326, dtype=np.dtype("float32"), nodata=-9999.0) + + async def periods(self, start: str, end: str) -> list[str]: + """Return the ordered list of period IDs to fetch.""" + # Return ISO date strings, month strings, year strings, etc. + return ["2024-01-01", "2024-01-02"] # replace with real logic + + async def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: + """Fetch one period. Must return a Dataset with a 'time' dimension.""" + return await asyncio.get_running_loop().run_in_executor( + _executor, self._fetch_sync, period_id, bbox + ) + + def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: + # Blocking I/O in thread pool — download, clip to bbox, return Dataset. + ... 
+``` + +### Dataset template + +```yaml +- id: my_streaming_dataset + name: My streaming dataset + variable: rainfall + period_type: daily + sync: + kind: temporal + execution: append + extents: + spatial: + bbox: [-180, -50, 180, 50] + temporal: + begin: "2000-01-01" + resolution: P1D + ingestion: + plugin: mypackage.sources.my_plugin.MyPlugin + params: + variable: rainfall + units: mm + resolution: 5 km x 5 km + source: My source +``` + +### Key conventions for `fetch_period` + +- The returned Dataset must have a `time` dimension with exactly the period's time steps as coordinate values. +- Spatial dimensions should be named `x` and `y` (or match `GridSpec.x_dim` / `GridSpec.y_dim`). +- Clear all encoding before returning and pin the time encoding: `ds["time"].encoding.update({"units": "days since 1970-01-01", "dtype": "int32"})`. +- For sources where blocking I/O is unavoidable (rioxarray, requests), run it in a `ThreadPoolExecutor` as shown above. + +See the built-in plugins (`climate_api/ingest/plugins/`) for complete worked examples: `chirps3.py` (COG range requests), `era5_land.py` (remote zarr), and `worldpop.py` (full-file download). diff --git a/docs/architecture.md b/docs/architecture.md index 0f2a8a71..815e5512 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -52,6 +52,10 @@ This is a deliberate design constraint: each instance serves one place. A Sierra ## Data lifecycle +The framework supports two ingestion paths depending on the template's `ingestion` block. 
+ +**Function path** (`ingestion.function`) — for simpler file-based sources: + ``` Template (YAML) │ @@ -69,26 +73,38 @@ Artifact (internal record) │ publish=true ▼ Managed dataset (public API) - ├── /datasets/{id} — native metadata - ├── /zarr/{id} — raw zarr store access - ├── /stac/collections/{id} — STAC discovery + ├── /datasets/{id} — native metadata + ├── /zarr/{id} — raw zarr store access + ├── /stac/collections/{id} — STAC discovery └── /ogcapi/collections/{id} — OGC API access ``` -The ingestion function is called identically by both `POST /ingestions` and `POST /sync` — the framework invokes it the same way regardless of the trigger. A correctly written ingestion function works for both without any changes. +**Plugin path** (`ingestion.plugin`) — for streaming and resumable ingests: -The framework is responsible for everything from "write zarr" onward. An ingestion function only needs to write NetCDF files to a given directory. The framework then: +``` +Template (YAML) + │ + │ POST /ingestions (or POST /sync) + ▼ +Orchestrator + │ probe() → fix chunk shape, write GeoZarr attributes + │ periods() → compare against committed store state + │ for each pending period: + │ fetch_period() → xr.Dataset (in source CRS) + │ to_zarr(icechunk_store, append_dim="time") + │ commit every commit_batch_size periods + │ rechunk in-place (if rechunk_time is set) + │ expire intermediate snapshots + │ register ArtifactFormat.ICECHUNK artifact record + │ + │ publish=true + ▼ +Managed dataset (public API) — same endpoints as above +``` -1. reads and normalises the coordinate names -2. applies transforms (unit conversion, etc.) -3. reprojects to the instance CRS -4. builds the zarr store with auto-computed chunking -5. writes GeoZarr root attributes (`spatial:bbox`, `proj:code`) so map clients can position tiles -6. computes artifact coverage (spatial bounds + time range) from the written data -7. stores the artifact record -8. 
publishes the managed dataset through pygeoapi if `publish=true` +The plugin path writes directly to an Icechunk store — no intermediate files on disk. A crash leaves the store at the last committed period; restart resumes from there. The store is readable and serveable from the first committed period. -This division means that ingestion functions do not need to know about zarr conventions, STAC, OGC, or pygeoapi. They write data files; the framework handles everything else. +Both paths produce the same public API surface. The `/zarr/{id}` and `/stac/collections/{id}` routes handle both `ZARR` and `ICECHUNK` artifact formats transparently. --- @@ -130,7 +146,7 @@ Before executing a sync, the engine calls the availability function to clamp the ## The plugin contract -The platform has four extension points. Each one has a narrow contract — the framework handles everything else automatically. +The platform has five extension points. Each one has a narrow contract — the framework handles everything else automatically. ### Ingestion function @@ -150,25 +166,40 @@ def download( The function writes NetCDF files. The framework reads them, normalises coordinate names, applies transforms, reprojects to the instance CRS, builds the zarr, writes GeoZarr attributes, computes coverage, and registers the artifact. -The ingestion function is called identically by `POST /ingestions` and `POST /sync`. The caller makes no difference to the function — it always receives the same parameters. - -**Reusing ingestion logic across templates**: multiple YAML templates can reference the same Python function and differentiate via `default_params`. 
This is the intended pattern for sources that have the same fetching logic but expose different variables: +**Reusing ingestion logic across templates**: multiple YAML templates can reference the same Python function and differentiate via `default_params`: ```yaml -# era5land_temperature_hourly.yaml ingestion: function: dhis2eo.data.era5_land.download default_params: variable: 2m_temperature +``` -# era5land_precipitation_hourly.yaml -ingestion: - function: dhis2eo.data.era5_land.download - default_params: - variable: total_precipitation +### Ingestion plugin + +For sources that need streaming access, concurrent fetching, or resumable long ingests, use `ingestion.plugin` instead of a download function: + +```python +class MyPlugin: + max_concurrency: int = 1 # parallel fetch limit + commit_batch_size: int = 1 # periods per Icechunk commit + + async def probe(self, bbox: list[float], **params) -> GridSpec: + """Metadata-only probe — no data transfer.""" + ... + + async def periods(self, start: str, end: str) -> list[str]: + """Return the ordered list of period IDs available from start to end.""" + ... + + async def fetch_period(self, period_id: str, bbox: list[float], **params) -> xr.Dataset: + """Fetch one period. Return a Dataset with a 'time' dimension in source CRS.""" + ... ``` -No framework changes are needed to support a new variable from the same source. +The orchestrator calls `probe()` once, `periods()` once, then drives a bounded-concurrency fetch loop — writing each period directly to an Icechunk store and committing every `commit_batch_size` periods. Plugins never touch zarr or Icechunk directly. + +See [Extensibility — Ingestion plugins](extensibility.md#ingestion-plugins) for the full protocol and `GridSpec` reference. ### Transform function @@ -284,9 +315,11 @@ Each instance is configured for one place. This keeps the data model simple (no The sync engine validates that new data connects to the end of the existing artifact before appending. 
If a gap exists, the sync fails rather than silently producing a dataset with a hole. This is a deliberate constraint: downstream consumers (DHIS2, CHAP) depend on continuous time series and should not receive data with silent gaps. -### The append execution mode avoids re-downloading history +### The append execution mode + +For **function-path** datasets, `append` downloads only the missing time range and rebuilds the full zarr from all cached files. The local cache (NetCDF files in `data/downloads/`) is the source of truth; the zarr is a derived view. If the cache is deleted, a rematerialize is required to recover. -`append` downloads only the missing range and rebuilds the full zarr from all cached files. This means the local cache (NetCDF files in `data/downloads/`) is the source of truth for the full time series; the zarr is a derived view. If the cache is deleted, a rematerialize is required to recover. +For **plugin-path** datasets, `append` compares the pending period list against the already-committed time coordinates in the Icechunk store and fetches only the missing periods. The Icechunk store itself is the source of truth — no separate download cache. A crash leaves the store at the last committed period; restart resumes from there without any additional recovery logic. ### Transforms run after download, before reproject diff --git a/docs/built_in_datasets.md b/docs/built_in_datasets.md index bcf0e53f..fd5f488f 100644 --- a/docs/built_in_datasets.md +++ b/docs/built_in_datasets.md @@ -21,7 +21,9 @@ To ingest a built-in dataset for your configured extent, see the [API reference] CHIRPS (Climate Hazards Group InfraRed Precipitation with Station data) v3 is a quasi-global daily precipitation dataset merging satellite thermal infrared imagery with station observations. It is widely used for drought monitoring, food security analysis, and WASH planning in low- and middle-income countries. 
-**Sync behaviour** — new data is ingested incrementally as it becomes available. CHIRPS has a nominal publication lag of around 3–7 days, so data through yesterday is not always present. The API uses a custom availability function that checks the actual latest available date from the CHIRPS server before each sync. +**Ingest method** — each day is fetched as a Cloud-Optimized GeoTIFF via HTTP range request. Only the configured bbox window is downloaded; full global files are never transferred. Up to four days are fetched concurrently and written directly to the Icechunk store — no intermediate files on disk. + +**Sync behaviour** — new days are appended incrementally. CHIRPS final data lags approximately 1–2 months (exact cutoff: end of the previous month if today is after the 20th, else two months back). Only the missing days are fetched on each sync run. **Transforms** — none applied; data is stored as received in mm. @@ -42,7 +44,9 @@ CHIRPS (Climate Hazards Group InfraRed Precipitation with Station data) v3 is a ERA5-Land is a global atmospheric reanalysis produced by ECMWF. The 2 m temperature variable (`t2m`) represents the air temperature 2 metres above the land surface, including corrections for topography relative to the ERA5 pressure levels. -**Sync behaviour** — new hours are appended incrementally. ERA5-Land is published with a nominal 5-day lag; the API will not request data closer than 120 hours to the current time. +**Ingest method** — the DestinE zarr store is opened lazily over HTTPS. Each monthly period is fetched as an hourly slice, aggregated to daily values (mean), and written directly to the Icechunk store — no intermediate files on disk. The source's 0–360° longitude range is converted to −180–180° before writing. + +**Sync behaviour** — new months are appended incrementally. ERA5-Land is published with a nominal 5-day lag; months closer than 120 hours to today are not requested. **Transforms** — raw values are in Kelvin. 
The `kelvin_to_celsius` transform is applied at ingest time, so stored values are in °C. @@ -63,7 +67,9 @@ ERA5-Land is a global atmospheric reanalysis produced by ECMWF. The 2 m temperat Total precipitation (`tp`) from ERA5-Land is an accumulated hourly value representing the sum of large-scale and convective precipitation falling onto the land surface. It is useful as a high-resolution complement to CHIRPS for countries outside CHIRPS's 50°N–50°S band, or for sub-daily analysis. -**Sync behaviour** — same 5-day lag as ERA5-Land temperature; hours are appended incrementally. +**Ingest method** — same as ERA5-Land temperature: monthly periods fetched and aggregated, written directly to Icechunk. + +**Sync behaviour** — same 5-day lag as ERA5-Land temperature; months are appended incrementally. **Transforms** — raw values are in metres per hour. The `metres_to_mm` transform converts to mm at ingest time. @@ -85,7 +91,9 @@ Total precipitation (`tp`) from ERA5-Land is an accumulated hourly value represe WorldPop Global2 provides gridded population estimates and projections at 100 m resolution. Each raster year represents estimated residential population counts. Years up to and including the present are backward-modelled estimates; years beyond the present are forward projections. -**Sync behaviour** — population data is released year by year, not as a continuous stream. The API uses a `release`-kind sync that checks each calendar year separately. Future years (projections) are also requestable, since the underlying data covers through 2030. +**Ingest method** — each year is downloaded as a per-country GeoTIFF from WorldPop's HTTP server (typically 50–200 MB per file), clipped to the configured bbox, and written directly to the Icechunk store. The country code must be set in `ingestion.params.country_code` in the dataset template (e.g. `NOR`, `GHA`). + +**Sync behaviour** — population data is released year by year. 
The API uses a `release`-kind sync that checks each calendar year separately. Future years (projections through 2030) are also requestable. **Transforms** — none applied; values are stored as received (population counts per pixel). diff --git a/docs/extensibility.md b/docs/extensibility.md index f80c3e2d..58ff7610 100644 --- a/docs/extensibility.md +++ b/docs/extensibility.md @@ -8,6 +8,7 @@ The same pattern applies at every extension point: | --------------- | ------------- | --------------- | | [Dataset templates](#dataset-templates) | YAML files | `plugins_dir/datasets/` | | [Ingestion functions](#ingestion-functions) | Python function, dotted path in YAML | any importable path | +| [Ingestion plugins](#ingestion-plugins) | Python class implementing `IngestionPlugin` | any importable path | | [Transform functions](#transform-functions) | Python function, dotted path in YAML | any importable path | | [Processes](#processes) | YAML file + Python function | `plugins_dir/processes/` | @@ -47,6 +48,81 @@ The function must follow the download function contract (see [Adding custom data --- +## Ingestion plugins + +For sources that require streaming, concurrent fetching, or direct-to-store writes without intermediate files, use `ingestion.plugin` instead of `ingestion.function`. The `plugin` field specifies a Python class implementing the `IngestionPlugin` protocol. + +```yaml +ingestion: + plugin: mypackage.sources.MyPlugin + params: + variable: rainfall + stage: final +``` + +Both `ingestion.function` and `ingestion.plugin` can coexist in the same template — the `plugin` path is used when present, the `function` path serves as a fallback for legacy tooling. + +### Plugin protocol + +A plugin implements three focused async methods. 
The Climate API layer owns the orchestration loop — plugins never write to zarr or Icechunk directly: + +```python +from climate_api.ingest.protocol import GridSpec +import xarray as xr + +class MyPlugin: + max_concurrency: int = 1 # parallel fetch limit + commit_batch_size: int = 1 # periods per Icechunk commit + + async def probe(self, bbox: list[float], **params) -> GridSpec: + """Metadata-only source probe. Returns grid shape, CRS, dtype. No data transfer.""" + ... + + async def periods(self, start: str, end: str) -> list[str]: + """Return the ordered list of available period IDs from start to end.""" + ... + + async def fetch_period(self, period_id: str, bbox: list[float], **params) -> xr.Dataset: + """Fetch one period. Return a dataset with a 'time' dimension in source CRS.""" + ... +``` + +**`GridSpec`** is the return type of `probe()`: + +```python +@dataclass +class GridSpec: + shape: tuple[int, int] # (ny, nx) grid dimensions + crs: int # EPSG code, e.g. 4326 or 32633 + dtype: np.dtype # data type, e.g. np.dtype("float32") + nodata: float | None = None # fill value + time_dim: bool = True # False for static (time-invariant) datasets + extra_dims: dict[str, int] = field(default_factory=dict) # e.g. {"age_group": 20} +``` + +Set `time_dim=False` for static (time-invariant) datasets — the orchestrator issues a single write with no append dimension. + +### What the orchestrator does + +1. Calls `probe()` once to fix the Icechunk store's chunk shape and write GeoZarr attributes. +2. Calls `periods()` once to get the full period list; filters against already-committed time coordinates. +3. Creates all fetch tasks upfront so up to `max_concurrency` fetches are in flight simultaneously. +4. Awaits tasks in chronological order so writes are always sequential. +5. Commits to the Icechunk store after every `commit_batch_size` periods. +6. On restart, resumes from the last committed period — a crash loses at most one uncommitted batch. +7. 
After all periods are written, runs a rechunk pass if the plugin declares `rechunk_time`, then expires intermediate Icechunk snapshots to prune history. + +### Choosing between function and plugin + +| Approach | When to use | +| -------- | ----------- | +| `ingestion.function` | Simple sources — write one or more NetCDF files to a directory | +| `ingestion.plugin` | Streaming sources, COG range requests, remote zarr, resumable long ingests | + +See [Adding custom datasets](adding_custom_datasets.md#ingestion-plugin) for a worked example. + +--- + ## Transform functions Transforms are functions applied to a dataset after download and before the Zarr store is written. They are declared as a list of dotted paths in the dataset template: From 51b970329d97f12a1bc83b0b550046ab3c677283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 21:17:59 +0200 Subject: [PATCH 11/80] =?UTF-8?q?refactor:=20remove=20ingestion.function?= =?UTF-8?q?=20=E2=80=94=20ingestion.plugin=20is=20the=20only=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the legacy NetCDF-file-based ingestion.function download path entirely. All datasets now ingest through IngestionPlugin directly into Icechunk stores. 
- Remove ingestion.function and default_params from YAML dataset configs - Require ingestion.plugin in dataset validation (was optional fallback) - Remove download_dataset, _validate_spatial_coverage, and related routes - Remove country_code param from create_artifact/run_sync (was only used by the legacy download path); inject it automatically from extent.country_code into any plugin whose constructor declares the parameter (WorldPop only) - Update docs (architecture, extensibility, adding_custom_datasets, instance_guide) and AGENTS.md to reflect plugin-only ingestion Co-Authored-By: Claude Sonnet 4.6 --- AGENTS.md | 11 +- climate_api/data/datasets/chirps3.yaml | 1 - climate_api/data/datasets/era5_land.yaml | 6 - climate_api/data/datasets/worldpop.yaml | 6 +- climate_api/data_manager/routes.py | 27 - .../data_manager/services/downloader.py | 136 +---- .../data_registry/services/datasets.py | 8 +- climate_api/ingest/orchestrator.py | 20 +- climate_api/ingestions/routes.py | 2 - climate_api/ingestions/services.py | 147 +---- climate_api/ingestions/sync_engine.py | 2 - climate_api/publications/services.py | 3 +- docs/adding_custom_datasets.md | 69 +-- docs/architecture.md | 68 +-- docs/extensibility.md | 25 +- docs/instance_guide.md | 8 +- tests/test_config.py | 8 +- tests/test_dataset_registry.py | 8 +- tests/test_datasets.py | 509 +----------------- tests/test_datasets_sync.py | 58 +- tests/test_downloader.py | 211 -------- tests/test_ingest_orchestrator.py | 3 +- tests/test_ingest_plugins.py | 2 - uv.lock | 33 ++ 24 files changed, 108 insertions(+), 1263 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index e34e7519..b80e6adb 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -8,7 +8,7 @@ The DHIS2 Climate API is a FastAPI-based REST API that downloads, processes, and Key concepts: -- **Dataset templates** — YAML files in `data/datasets/` describing a data source (variable, period type, download function). These are blueprints. 
+- **Dataset templates** — YAML files in `data/datasets/` describing a data source (variable, period type, ingestion plugin). These are blueprints. - **Artifacts / managed datasets** — ingested instances of a template for a specific spatial extent and time range. Exposed under `/datasets` and `/zarr/{dataset_id}`. - **Extent** — a single named spatial bounding box configured at instance setup time (`id`, `bbox`, optional `country_code`). Exposed at `GET /extent`. - **GeoZarr stores** — datasets are stored as chunked Zarr v3 archives with GeoZarr spatial attributes. Flat stores for small extents; multiscale pyramids for large ones. Served chunk-by-chunk over HTTP with no specialised server middleware. @@ -44,18 +44,17 @@ The `.env` file is required for `make run` and `make openapi`. Copy `.env.exampl ## Dataset templates -Each YAML in `data/datasets/` defines a dataset template. The `ingestion` block controls download and zarr build behaviour: +Each YAML in `data/datasets/` defines a dataset template. The `ingestion` block specifies the plugin class that streams data directly into the Icechunk store: ```yaml ingestion: - function: dhis2eo.data.worldpop.pop_total.yearly.download - default_params: {} # passed to the download function + plugin: climate_api.ingest.plugins.worldpop.WorldPopPlugin + params: + version: global2 ``` `build_dataset_zarr` in `data_manager/downloader.py` builds a multiscale Zarr pyramid when the spatial dimensions exceed 2048×2048 pixels; otherwise it writes a flat chunked zarr with chunk sizes derived from the dataset's temporal resolution. -The ingestion interface is being redesigned as a plugin protocol (see GitHub issue #64) — the `ingestion.function` convention will be replaced by a three-method async plugin (`probe`, `periods`, `fetch_period`). - ## pygeoapi pygeoapi is mounted at `/ogcapi` as a sub-application. 
Its config is generated dynamically from published artifacts by `publications/services.py` and written to `data/pygeoapi/pygeoapi-config.yml`. diff --git a/climate_api/data/datasets/chirps3.yaml b/climate_api/data/datasets/chirps3.yaml index 652c814f..076e8784 100644 --- a/climate_api/data/datasets/chirps3.yaml +++ b/climate_api/data/datasets/chirps3.yaml @@ -19,7 +19,6 @@ params: stage: final flavor: rnl - function: dhis2eo.data.chc.chirps3.daily.download units: mm resolution: 5 km x 5 km source: CHIRPS v3 diff --git a/climate_api/data/datasets/era5_land.yaml b/climate_api/data/datasets/era5_land.yaml index e193fe61..00978f4a 100644 --- a/climate_api/data/datasets/era5_land.yaml +++ b/climate_api/data/datasets/era5_land.yaml @@ -19,9 +19,6 @@ plugin: climate_api.ingest.plugins.era5_land.Era5LandPlugin params: variable: t2m - function: dhis2eo.data.destine.era5_land.hourly.download - default_params: - variables: ['t2m'] transforms: - climate_api.transforms.kelvin_to_celsius units: degC @@ -53,9 +50,6 @@ plugin: climate_api.ingest.plugins.era5_land.Era5LandPlugin params: variable: tp - function: dhis2eo.data.destine.era5_land.hourly.download - default_params: - variables: ['tp'] transforms: - climate_api.transforms.metres_to_mm units: mm diff --git a/climate_api/data/datasets/worldpop.yaml b/climate_api/data/datasets/worldpop.yaml index 82c4a517..170780cd 100644 --- a/climate_api/data/datasets/worldpop.yaml +++ b/climate_api/data/datasets/worldpop.yaml @@ -19,11 +19,7 @@ ingestion: plugin: climate_api.ingest.plugins.worldpop.WorldPopPlugin params: - # country_code is required — set the ISO 3166-1 alpha-3 code for your deployment - # e.g. 
country_code: NOR - version: global2 - function: dhis2eo.data.worldpop.pop_total.yearly.download - default_params: + # country_code is injected automatically from extent.country_code in climate-api.yaml version: global2 units: people resolution: 100m x 100m diff --git a/climate_api/data_manager/routes.py b/climate_api/data_manager/routes.py index ebd6703a..e69bdeef 100644 --- a/climate_api/data_manager/routes.py +++ b/climate_api/data_manager/routes.py @@ -8,33 +8,6 @@ router = APIRouter() -@router.get( - "/{dataset_id}/download", - response_model=dict, - summary="Internal dataset download", - include_in_schema=False, -) -def download_dataset( - dataset_id: str, - start: str, - background_tasks: BackgroundTasks, - end: str | None = None, - overwrite: bool = False, -) -> dict[str, str]: - """Internal low-level cache download route kept for compatibility.""" - dataset = _get_dataset_or_404(dataset_id) - downloader.download_dataset( - dataset, - start=start, - end=end, - bbox=None, - country_code=None, - overwrite=overwrite, - background_tasks=background_tasks, - ) - return {"status": "Downloading data for dataset"} - - @router.get( "/{dataset_id}/build_zarr", response_model=dict, diff --git a/climate_api/data_manager/services/downloader.py b/climate_api/data_manager/services/downloader.py index 276cd7b0..67a340ae 100644 --- a/climate_api/data_manager/services/downloader.py +++ b/climate_api/data_manager/services/downloader.py @@ -1,8 +1,6 @@ -"""Dataset cache: download, store, and optimize raster data as local files.""" +"""Dataset cache: build and optimize raster data as local Zarr stores.""" -import datetime import importlib -import inspect import logging import os import shutil @@ -12,7 +10,6 @@ import xarray as xr import xproj # noqa: F401 # type: ignore[import-untyped] # pyright: ignore[reportUnusedImport] -from fastapi import BackgroundTasks, HTTPException from geozarr_toolkit import MultiscalesConventionMetadata, create_geozarr_attrs from topozarr.coarsen 
import create_pyramid @@ -36,80 +33,6 @@ def _resolve_download_dir() -> Path: DOWNLOAD_DIR = _resolve_download_dir() -def download_dataset( - dataset: dict[str, Any], - start: str, - end: str | None, - bbox: list[float] | None, - country_code: str | None, - overwrite: bool, - background_tasks: BackgroundTasks | None, -) -> list[Path]: - """Download dataset files and return the NetCDF paths created or modified by this run. - - The download still happens primarily through side effects in the provider function. - This return value is used to identify the concrete files created for this invocation. - When running in the background-task path, the download is deferred and this function - returns an empty list because no files have been created yet. - """ - _validate_spatial_coverage(dataset, bbox if bbox is not None else _bbox_from_env()) - ingestion = dataset["ingestion"] - eo_download_func_path = ingestion["function"] - eo_download_func = _get_dynamic_function(eo_download_func_path) - before_files = {path.resolve(): path.stat().st_mtime_ns for path in get_cache_files(dataset)} - - params = dict(ingestion.get("default_params", {})) - params.update( - { - "start": start, - "end": end or datetime.date.today().isoformat(), - "dirname": DOWNLOAD_DIR, - "prefix": _get_cache_prefix(dataset), - "overwrite": overwrite, - } - ) - - sig = inspect.signature(eo_download_func) - try: - if "bbox" in sig.parameters: - params["bbox"] = _resolve_bbox(bbox=bbox) - if "country_code" in sig.parameters: - resolved_country_code = country_code or os.getenv("COUNTRY_CODE") - if resolved_country_code: - params["country_code"] = resolved_country_code - else: - raise HTTPException( - status_code=400, - detail=( - "Downloading this dataset requires a country code. " - "Provide it through the resolved extent configuration or set COUNTRY_CODE in the environment." 
- ), - ) - except HTTPException: - raise - except ValueError as exc: - raise HTTPException(status_code=400, detail=str(exc)) from exc - - if background_tasks is not None: - background_tasks.add_task(eo_download_func, **params) - return [] - - try: - eo_download_func(**params) - except HTTPException: - raise - except ValueError as exc: - raise HTTPException(status_code=400, detail=str(exc)) from exc - except Exception as exc: - message = str(exc).strip() or "Unexpected error from upstream data provider" - raise HTTPException(status_code=502, detail=f"Upstream dataset download failed: {message}") from exc - - after_files = [path.resolve() for path in get_cache_files(dataset)] - changed_files = [ - path for path in after_files if path not in before_files or path.stat().st_mtime_ns != before_files[path] - ] - return changed_files - def build_dataset_zarr(dataset: dict[str, Any], *, start: str | None = None, end: str | None = None) -> None: """Collect dataset cache files into one optimised Zarr archive, clipped to request scope.""" @@ -341,39 +264,6 @@ def get_zarr_path(dataset: dict[str, Any]) -> Path | None: return None -def _validate_spatial_coverage(dataset: dict[str, Any], bbox: list[float] | None) -> None: - """Raise HTTP 400 if the request bbox falls outside the dataset's declared extents.""" - extents = dataset.get("extents") - if not extents or bbox is None: - return - spatial = extents.get("spatial") - if not spatial: - return - cov_bbox = spatial.get("bbox") - if not isinstance(cov_bbox, (list, tuple)) or len(cov_bbox) != 4: - return - cov_xmin, cov_ymin, cov_xmax, cov_ymax = cov_bbox - xmin, ymin, xmax, ymax = bbox - if ymin > cov_ymax or ymax < cov_ymin: - raise HTTPException( - status_code=400, - detail=( - f"Dataset '{dataset['id']}' does not cover this extent. " - f"Latitude coverage: {cov_ymin}°–{cov_ymax}°, " - f"requested: {ymin}°–{ymax}°." 
- ), - ) - if xmin > cov_xmax or xmax < cov_xmin: - raise HTTPException( - status_code=400, - detail=( - f"Dataset '{dataset['id']}' does not cover this extent. " - f"Longitude coverage: {cov_xmin}°–{cov_xmax}°, " - f"requested: {xmin}°–{xmax}°." - ), - ) - - def _get_dynamic_function(full_path: str) -> Callable[..., Any]: """Import and return a function given its dotted module path.""" parts = full_path.split(".") @@ -383,27 +273,3 @@ def _get_dynamic_function(full_path: str) -> Callable[..., Any]: return getattr(module, function_name) # type: ignore[no-any-return] -def _resolve_bbox(*, bbox: list[float] | None) -> list[float]: - """Resolve bbox from request or environment.""" - if bbox is not None: - return bbox - - env_bbox = _bbox_from_env() - if env_bbox is not None: - return env_bbox - - raise ValueError( - "A bbox is required for this dataset. Provide it in the request or set DOWNLOAD_BBOX in the environment." - ) - - -def _bbox_from_env() -> list[float] | None: - """Parse a default bbox from environment if configured.""" - raw_bbox = os.getenv("DOWNLOAD_BBOX") or os.getenv("DEFAULT_DOWNLOAD_BBOX") - if not raw_bbox: - return None - - parts = [part.strip() for part in raw_bbox.split(",")] - if len(parts) != 4: - raise ValueError("DOWNLOAD_BBOX must contain four comma-separated numbers: xmin,ymin,xmax,ymax") - return [float(part) for part in parts] diff --git a/climate_api/data_registry/services/datasets.py b/climate_api/data_registry/services/datasets.py index acc3d51d..b3a1f354 100644 --- a/climate_api/data_registry/services/datasets.py +++ b/climate_api/data_registry/services/datasets.py @@ -152,14 +152,10 @@ def _validate_dataset_template(dataset: object, *, source: str) -> None: ingestion = dataset.get("ingestion") if not isinstance(ingestion, dict): raise ValueError(f"Dataset template '{dataset_id}' in {source} must define an 'ingestion' block") - function = ingestion.get("function") plugin = ingestion.get("plugin") - has_function = isinstance(function, 
str) and function - has_plugin = isinstance(plugin, str) and plugin - if not has_function and not has_plugin: + if not (isinstance(plugin, str) and plugin): raise ValueError( - f"Dataset template '{dataset_id}' in {source} must define either " - "ingestion.function (legacy download path) or ingestion.plugin (per-period Icechunk ingest)" + f"Dataset template '{dataset_id}' in {source} must define ingestion.plugin" ) sync_availability = sync_block.get("availability") if isinstance(sync_block, dict) else None diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index 7b4ec08d..e353b78a 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -14,6 +14,7 @@ import asyncio import importlib +import inspect import logging from collections.abc import Callable from datetime import datetime, timezone @@ -28,19 +29,34 @@ logger = logging.getLogger(__name__) -def load_plugin(dotted_path: str, params: dict[str, Any]) -> IngestionPlugin: +def load_plugin( + dotted_path: str, + params: dict[str, Any], + extra_params: dict[str, Any] | None = None, +) -> IngestionPlugin: """Instantiate an IngestionPlugin from a dotted import path and YAML params. The class is imported from dotted_path and called with **params. Built-in plugins accept variable and other source-specific kwargs; custom plugins define their own __init__ signature. + + extra_params are merged into params only for keys that the constructor + declares and that are not already present in params. This is used to inject + instance-level config (e.g. country_code from the extent) without requiring + every plugin to accept it. 
""" module_path, _, class_name = dotted_path.rpartition(".") if not module_path: raise ValueError(f"Invalid plugin path '{dotted_path}': must be 'module.ClassName'") module = importlib.import_module(module_path) cls = getattr(module, class_name) - plugin = cls(**params) + merged = dict(params) + if extra_params: + sig = inspect.signature(cls.__init__) + for key, value in extra_params.items(): + if key not in merged and key in sig.parameters: + merged[key] = value + plugin = cls(**merged) if not isinstance(plugin, IngestionPlugin): raise TypeError(f"{dotted_path} does not implement IngestionPlugin") return plugin diff --git a/climate_api/ingestions/routes.py b/climate_api/ingestions/routes.py index b558fe22..3b33ef95 100644 --- a/climate_api/ingestions/routes.py +++ b/climate_api/ingestions/routes.py @@ -30,13 +30,11 @@ def create_ingestion(request: CreateIngestionRequest) -> IngestionResponse: dataset = _get_dataset_or_404(request.dataset_id) extent = get_extent_or_404() resolved_bbox = list(extent["bbox"]) - resolved_country_code = extent.get("country_code") artifact = services.create_artifact( dataset=dataset, start=request.start, end=request.end, bbox=resolved_bbox, - country_code=resolved_country_code, overwrite=request.overwrite, prefer_zarr=request.prefer_zarr, publish=request.publish, diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 0522a405..e2882537 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -151,14 +151,13 @@ def create_artifact( start: str, end: str | None, bbox: list[float] | None, - country_code: str | None, overwrite: bool, prefer_zarr: bool, publish: bool, download_start: str | None = None, download_end: str | None = None, ) -> ArtifactRecord: - """Download a dataset, persist it locally, and store artifact metadata.""" + """Ingest a dataset via its plugin, persist it locally, and store artifact metadata.""" period_type = str(dataset["period_type"]) start = 
_normalize_request_period(start, period_type=period_type, field_name="start") end = _normalize_optional_request_period(end, period_type=period_type, field_name="end") @@ -172,7 +171,6 @@ def create_artifact( download_start=download_start, download_end=download_end, ) - requires_canonical_zarr = download_start is not None resolved_download_end = download_end if download_end is not None else end if resolved_download_end is None: resolved_download_end = _default_request_end(period_type) @@ -184,7 +182,7 @@ def create_artifact( existing = _find_existing_artifact( dataset_id=str(dataset["id"]), request_scope=request_scope, - prefer_zarr=prefer_zarr or requires_canonical_zarr, + prefer_zarr=prefer_zarr, ) if existing is not None and not overwrite: logger.info( @@ -198,136 +196,15 @@ def create_artifact( return publish_artifact_record(existing.artifact_id) return existing - ingestion = dataset.get("ingestion") or {} - if isinstance(ingestion, dict) and ingestion.get("plugin"): - return _create_icechunk_artifact( - dataset=dataset, - start=start, - end=resolved_download_end, - bbox=bbox, - request_scope=request_scope, - publish=publish, - ingest_start=download_start, - ) - - logger.info( - "Downloading dataset '%s': request_scope=%s..%s download_scope=%s..%s prefer_zarr=%s publish=%s", - dataset["id"], - start, - end, - download_start or start, - resolved_download_end, - prefer_zarr, - publish, - ) - downloaded_files = downloader.download_dataset( - dataset, - start=download_start or start, + return _create_icechunk_artifact( + dataset=dataset, + start=start, end=resolved_download_end, bbox=bbox, - country_code=country_code, - overwrite=overwrite, - background_tasks=None, - ) - logger.info("Download finished for dataset '%s': changed_files=%d", dataset["id"], len(downloaded_files)) - - if prefer_zarr or requires_canonical_zarr: - try: - logger.info("Building canonical Zarr artifact for dataset '%s'", dataset["id"]) - downloader.build_dataset_zarr(dataset, start=start, 
end=end) - logger.info("Canonical Zarr artifact built for dataset '%s'", dataset["id"]) - except Exception as exc: - if requires_canonical_zarr: - if isinstance(exc, ValueError): - raise HTTPException( - status_code=409, - detail=f"Append sync canonical Zarr rebuild failed for requested scope: {exc}", - ) from exc - raise HTTPException( - status_code=500, - detail="Append sync canonical Zarr rebuild failed unexpectedly.", - ) from exc - # Fall back to NetCDF when Zarr materialization is not viable. - logger.warning( - "Zarr materialization failed for dataset '%s'; falling back to NetCDF", - dataset["id"], - exc_info=True, - ) - - zarr_path = downloader.get_zarr_path(dataset) - if requires_canonical_zarr and zarr_path is None: - raise HTTPException( - status_code=500, - detail="Append sync requires a canonical Zarr artifact, but no Zarr store was produced.", - ) - cache_files = ( - downloader.get_cache_files(dataset) - if requires_canonical_zarr - else downloaded_files or downloader.get_cache_files(dataset) - ) - primary_path: str | None - - if zarr_path is not None: - artifact_format = ArtifactFormat.ZARR - primary_path = str(zarr_path.resolve()) - asset_paths = [primary_path] - elif cache_files: - artifact_format = ArtifactFormat.NETCDF - asset_paths = [str(path.resolve()) for path in cache_files] - primary_path = asset_paths[0] if len(asset_paths) == 1 else None - else: - raise HTTPException(status_code=500, detail="Download finished without any saved artifact files") - - coverage_data = get_data_coverage_for_paths( - dataset, - zarr_path=primary_path if artifact_format == ArtifactFormat.ZARR else None, - netcdf_paths=asset_paths if artifact_format == ArtifactFormat.NETCDF else None, - ) - if not coverage_data.get("has_data", True): - raise HTTPException(status_code=409, detail="Downloaded artifact contains no data for the requested scope") - _spatial_wgs84_data = coverage_data["coverage"].get("spatial_wgs84") - coverage = ArtifactCoverage( - 
temporal=CoverageTemporal(**coverage_data["coverage"]["temporal"]), - spatial=CoverageSpatial(**coverage_data["coverage"]["spatial"]), - spatial_wgs84=CoverageSpatial(**_spatial_wgs84_data) if _spatial_wgs84_data else None, - ) - if not _temporal_coverage_matches_request_scope(coverage.temporal, request_scope): - raise HTTPException( - status_code=409, - detail=( - "Materialized artifact coverage does not match the requested scope: " - f"coverage={coverage.temporal.start}..{coverage.temporal.end}, " - f"request={request_scope.start}..{request_scope.end}" - ), - ) - - record = ArtifactRecord( - artifact_id=str(uuid4()), - dataset_id=str(dataset["id"]), - dataset_name=str(dataset["name"]), - variable=str(dataset["variable"]), - format=artifact_format, - path=primary_path, - asset_paths=asset_paths, - variables=[str(dataset["variable"])], request_scope=request_scope, - coverage=coverage, - created_at=datetime.now(UTC), - publication=ArtifactPublication(), - ) - stored_record = _store_artifact_record(record, prefer_zarr=prefer_zarr, publish=publish) - logger.info( - "Stored artifact '%s' for dataset '%s': format=%s coverage=%s..%s", - stored_record.artifact_id, - dataset["id"], - stored_record.format, - stored_record.coverage.temporal.start, - stored_record.coverage.temporal.end, + publish=publish, + ingest_start=download_start, ) - if publish and stored_record.publication.status != PublicationStatus.PUBLISHED: - logger.info("Publishing artifact '%s' for dataset '%s'", stored_record.artifact_id, dataset["id"]) - return publish_artifact_record(stored_record.artifact_id) - return stored_record def _create_icechunk_artifact( @@ -362,7 +239,11 @@ def _create_icechunk_artifact( ) store_path = downloader.DOWNLOAD_DIR / f"{dataset_id}.icechunk" - plugin = load_plugin(plugin_path, params) + extent_country_code = extent.get("country_code") if extent else None + extra_params: dict[str, object] = {} + if extent_country_code: + extra_params["country_code"] = extent_country_code + 
plugin = load_plugin(plugin_path, params, extra_params=extra_params or None) effective_start = ingest_start if ingest_start is not None else start # Rechunk after the initial ingest (when no delta start is provided) using the @@ -522,9 +403,6 @@ def sync_dataset( source_dataset = registry_datasets.get_dataset(latest_artifact.dataset_id) if source_dataset is None: raise HTTPException(status_code=404, detail=f"Source dataset '{latest_artifact.dataset_id}' not found") - extent = get_extent() - resolved_country_code = extent.get("country_code") if extent else None - committed_end: str | None = None if latest_artifact.format == ArtifactFormat.ICECHUNK and latest_artifact.path: from climate_api.ingest.store import read_committed_period_ids @@ -544,7 +422,6 @@ def sync_dataset( latest_artifact=latest_artifact, source_dataset=source_dataset, requested_end=end, - country_code=resolved_country_code, prefer_zarr=prefer_zarr, publish=publish, create_artifact_fn=create_artifact, diff --git a/climate_api/ingestions/sync_engine.py b/climate_api/ingestions/sync_engine.py index 221adfe1..4f64ae82 100644 --- a/climate_api/ingestions/sync_engine.py +++ b/climate_api/ingestions/sync_engine.py @@ -168,7 +168,6 @@ def run_sync( latest_artifact: ArtifactRecord, source_dataset: dict[str, Any], requested_end: str | None, - country_code: str | None, prefer_zarr: bool, publish: bool, create_artifact_fn: Callable[..., ArtifactRecord], @@ -249,7 +248,6 @@ def run_sync( download_start=download_start, download_end=sync_detail.delta_end if download_start is not None else None, bbox=list(latest_artifact.request_scope.bbox) if latest_artifact.request_scope.bbox is not None else None, - country_code=country_code, overwrite=False, prefer_zarr=prefer_zarr, publish=publish, diff --git a/climate_api/publications/services.py b/climate_api/publications/services.py index 34b32639..5637b6ec 100644 --- a/climate_api/publications/services.py +++ b/climate_api/publications/services.py @@ -58,7 +58,8 @@ def 
publish_artifact(record: ArtifactRecord) -> ArtifactRecord: "collection_id": collection_id, "published_at": datetime.now(UTC), # Pyramid zarr and Icechunk stores are served via the /zarr endpoint, not pygeoapi. - "pygeoapi_path": None if (is_pyramid_zarr or is_icechunk) else f"/ogcapi/collections/{collection_id}", + "pygeoapi_path": None if (is_pyramid_zarr or is_icechunk) + else f"/ogcapi/collections/{collection_id}", } ) } diff --git a/docs/adding_custom_datasets.md b/docs/adding_custom_datasets.md index 7e27dc99..3804987b 100644 --- a/docs/adding_custom_datasets.md +++ b/docs/adding_custom_datasets.md @@ -8,61 +8,10 @@ The built-in dataset templates (CHIRPS3, ERA5-Land, WorldPop) ship as package da Adding a custom dataset involves two things: -1. **An ingestion function or plugin** — either a download function that writes NetCDF files to disk, or an `IngestionPlugin` class that streams data directly into an Icechunk store. -2. **A dataset template YAML** — a file that describes the dataset and tells the API which function or plugin to call. +1. **An `IngestionPlugin` class** — streams data directly into an Icechunk store one period at a time. +2. **A dataset template YAML** — a file that describes the dataset and tells the API which plugin to call. -Use the **download function** approach for simple sources. Use the **IngestionPlugin** approach for sources that benefit from streaming (COG range requests, remote zarr, resumable long ingests with per-period commits). Both can coexist in the same template during migration. - -## Step 1: Write the download function - -The download function must be importable as a dotted Python path. The API calls it with keyword arguments and ignores the return value — the function is expected to write NetCDF files to `dirname` using `prefix` as the filename prefix. 
- -```python -# mypackage/sources/enacts.py -from pathlib import Path - -def download( - *, - start: str, # ISO 8601 date or datetime - end: str, - dirname: Path, # directory to write output files into - prefix: str, # filename prefix (use e.g. f"{prefix}_{year}.nc") - overwrite: bool, - bbox: list[float], # [xmin, ymin, xmax, ymax] — include only if your source needs it - **kwargs: object, # absorbs default_params from the YAML template -) -> None: - """Download ENACTS rainfall and write NetCDF files to dirname.""" - ... -``` - -**Required parameters** — always passed by the API: - -| Parameter | Type | Description | -| ----------- | ---------- | ----------- | -| `start` | `str` | Start of the requested time range (ISO 8601) | -| `end` | `str` | End of the requested time range (ISO 8601) | -| `dirname` | `Path` | Directory to write output NetCDF files into | -| `prefix` | `str` | Filename prefix for output files | -| `overwrite` | `bool` | Whether to overwrite existing cached files | - -**Optional parameters** — passed only when present in the function signature: - -| Parameter | Type | Description | -| -------------- | --------------- | ----------- | -| `bbox` | `list[float]` | Bounding box as `[xmin, ymin, xmax, ymax]` — include this if your source requires a spatial filter | -| `country_code` | `str` | ISO 3166-1 alpha-3 code — include this if your source (e.g. WorldPop) requires a country code | - -Any extra keyword arguments from `ingestion.default_params` in the YAML template are forwarded as additional kwargs. - -The API normalises coordinate names at write time: `valid_time` → `time`, `lat`/`latitude` → `y`, `lon`/`longitude` → `x`. Using the canonical names in your output avoids any ambiguity, but upstream names are handled automatically. 
- -Install your package in the same environment as the Climate API: - -```bash -pip install ./mypackage -``` - -## Step 2: Create a dataset template YAML +## Step 1: Create a dataset template YAML Create a directory for your custom templates and add a YAML file. Each file contains a list of templates (even if there is only one): @@ -77,7 +26,9 @@ Create a directory for your custom templates and add a YAML file. Each file cont kind: temporal execution: append ingestion: - function: mypackage.sources.enacts.download + plugin: mypackage.sources.EnactsPlugin + params: + variable: rainfall units: mm resolution: 4 km x 4 km source: ENACTS @@ -129,12 +80,8 @@ Omit `sync.availability` entirely for `static` datasets or when you always want | Field | Required | Description | | ----- | -------- | ----------- | -| `ingestion.plugin` | One of `plugin` or `function` | Dotted path to an `IngestionPlugin` class — preferred for streaming sources | +| `ingestion.plugin` | Yes | Dotted path to an `IngestionPlugin` class | | `ingestion.params` | No | Constructor keyword arguments forwarded to the plugin class | -| `ingestion.function` | One of `plugin` or `function` | Dotted path to the download function — for simpler file-based sources | -| `ingestion.default_params` | No | Extra keyword arguments forwarded to the download function | - -Both keys can coexist in the same template. When `ingestion.plugin` is present it is used; `ingestion.function` serves as a fallback for legacy tooling. 
**Transforms** — applied after download, before writing to Zarr: @@ -229,7 +176,7 @@ The smallest valid template for a static dataset with no sync: sync: kind: static ingestion: - function: mypackage.sources.my_source.download + plugin: mypackage.sources.my_plugin.MyPlugin ``` --- diff --git a/docs/architecture.md b/docs/architecture.md index 815e5512..78dd2411 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -16,7 +16,7 @@ A template defines: - the dataset identifier and display metadata - the variable name, units, and period type -- how to download the data (`ingestion.function`) +- how to ingest the data (`ingestion.plugin`) - what transforms to apply (`transforms`) - what sync strategy to use (`sync.kind`, `sync.execution`) @@ -52,34 +52,9 @@ This is a deliberate design constraint: each instance serves one place. A Sierra ## Data lifecycle -The framework supports two ingestion paths depending on the template's `ingestion` block. +All datasets are ingested through the plugin path (`ingestion.plugin`): -**Function path** (`ingestion.function`) — for simpler file-based sources: - -``` -Template (YAML) - │ - │ POST /ingestions (or POST /sync) - ▼ -Ingestion - │ call ingestion function → NetCDF files on disk - │ apply transforms - │ reproject to instance CRS - │ write GeoZarr store - │ compute coverage (spatial + temporal extent of actual data) - ▼ -Artifact (internal record) - │ - │ publish=true - ▼ -Managed dataset (public API) - ├── /datasets/{id} — native metadata - ├── /zarr/{id} — raw zarr store access - ├── /stac/collections/{id} — STAC discovery - └── /ogcapi/collections/{id} — OGC API access -``` - -**Plugin path** (`ingestion.plugin`) — for streaming and resumable ingests: +**Plugin path** (`ingestion.plugin`) — streams data directly into an Icechunk store: ``` Template (YAML) @@ -102,9 +77,7 @@ Orchestrator Managed dataset (public API) — same endpoints as above ``` -The plugin path writes directly to an Icechunk store — no intermediate 
files on disk. A crash leaves the store at the last committed period; restart resumes from there. The store is readable and serveable from the first committed period. - -Both paths produce the same public API surface. The `/zarr/{id}` and `/stac/collections/{id}` routes handle both `ZARR` and `ICECHUNK` artifact formats transparently. +All ingest writes go directly to an Icechunk store — no intermediate files on disk. A crash leaves the store at the last committed period; restart resumes from there. The store is readable and serveable from the first committed period. --- @@ -146,39 +119,10 @@ Before executing a sync, the engine calls the availability function to clamp the ## The plugin contract -The platform has five extension points. Each one has a narrow contract — the framework handles everything else automatically. - -### Ingestion function - -```python -def download( - *, - start: str, # ISO 8601 date or datetime - end: str, - dirname: Path, # write output files here - prefix: str, # use as filename prefix, e.g. f"{prefix}_{year}.nc" - overwrite: bool, - bbox: list[float], # optional — only if the source needs a spatial filter - **kwargs, # default_params from the YAML template -) -> None: - # Write one or more NetCDF files to dirname. -``` - -The function writes NetCDF files. The framework reads them, normalises coordinate names, applies transforms, reprojects to the instance CRS, builds the zarr, writes GeoZarr attributes, computes coverage, and registers the artifact. - -**Reusing ingestion logic across templates**: multiple YAML templates can reference the same Python function and differentiate via `default_params`: - -```yaml -ingestion: - function: dhis2eo.data.era5_land.download - default_params: - variable: 2m_temperature -``` +The platform has four extension points. Each one has a narrow contract — the framework handles everything else automatically. 
### Ingestion plugin -For sources that need streaming access, concurrent fetching, or resumable long ingests, use `ingestion.plugin` instead of a download function: - ```python class MyPlugin: max_concurrency: int = 1 # parallel fetch limit @@ -210,7 +154,7 @@ def my_transform(ds: xr.Dataset, dataset: dict) -> xr.Dataset: # Do not modify dataset-level ds.attrs — the framework manages those. ``` -Transforms are applied in order after the ingestion function returns, before the zarr is written. They receive the full xarray Dataset and the template dict. They return a modified Dataset. They do not write to disk. +Transforms are applied in order after each period is fetched, before the data is written to the Icechunk store. They receive the full xarray Dataset and the template dict. They return a modified Dataset. They do not write to disk. ### Process execution function diff --git a/docs/extensibility.md b/docs/extensibility.md index 58ff7610..f6d6a621 100644 --- a/docs/extensibility.md +++ b/docs/extensibility.md @@ -7,7 +7,6 @@ The same pattern applies at every extension point: | Extension point | How to extend | Plugin location | | --------------- | ------------- | --------------- | | [Dataset templates](#dataset-templates) | YAML files | `plugins_dir/datasets/` | -| [Ingestion functions](#ingestion-functions) | Python function, dotted path in YAML | any importable path | | [Ingestion plugins](#ingestion-plugins) | Python class implementing `IngestionPlugin` | any importable path | | [Transform functions](#transform-functions) | Python function, dotted path in YAML | any importable path | | [Processes](#processes) | YAML file + Python function | `plugins_dir/processes/` | @@ -35,22 +34,9 @@ See [Adding custom datasets](adding_custom_datasets.md) for the full template fi --- -## Ingestion functions - -The `ingestion.function` field in a dataset template is a dotted Python path to the download function that fetches data for that dataset. 
- -```yaml -ingestion: - function: mypackage.sources.enacts.download -``` - -The function must follow the download function contract (see [Adding custom datasets](adding_custom_datasets.md#step-1-write-the-download-function)). It can live anywhere that is importable — either an installed package or a module placed directly under `plugins_dir` (which is automatically added to `sys.path`). - ---- - ## Ingestion plugins -For sources that require streaming, concurrent fetching, or direct-to-store writes without intermediate files, use `ingestion.plugin` instead of `ingestion.function`. The `plugin` field specifies a Python class implementing the `IngestionPlugin` protocol. +The `ingestion.plugin` field in a dataset template is a dotted Python path to an `IngestionPlugin` class. The orchestrator fetches data through the plugin and writes it to the Icechunk store one period at a time — no intermediate files, resumable on restart. ```yaml ingestion: @@ -60,8 +46,6 @@ ingestion: stage: final ``` -Both `ingestion.function` and `ingestion.plugin` can coexist in the same template — the `plugin` path is used when present, the `function` path serves as a fallback for legacy tooling. - ### Plugin protocol A plugin implements three focused async methods. The Climate API layer owns the orchestration loop — plugins never write to zarr or Icechunk directly: @@ -112,13 +96,6 @@ Set `time_dim=False` for static (time-invariant) datasets — the orchestrator i 6. On restart, resumes from the last committed period — a crash loses at most one uncommitted batch. 7. After all periods are written, runs a rechunk pass if the plugin declares `rechunk_time`, then expires intermediate Icechunk snapshots to prune history. 
-### Choosing between function and plugin - -| Approach | When to use | -| -------- | ----------- | -| `ingestion.function` | Simple sources — write one or more NetCDF files to a directory | -| `ingestion.plugin` | Streaming sources, COG range requests, remote zarr, resumable long ingests | - See [Adding custom datasets](adding_custom_datasets.md#ingestion-plugin) for a worked example. --- diff --git a/docs/instance_guide.md b/docs/instance_guide.md index 765696d8..3d99ca34 100644 --- a/docs/instance_guide.md +++ b/docs/instance_guide.md @@ -29,7 +29,7 @@ my-climate-service/ ├── .gitignore ├── plugins/ │ ├── datasets/ # custom dataset template YAMLs -│ ├── / # custom download / ingestion functions +│ ├── / # custom ingestion plugin modules │ │ ├── __init__.py │ │ └── daily.py │ ├── transforms/ # custom transform functions @@ -160,7 +160,7 @@ Visit `http://localhost:8000` to confirm the API is running. The `/extent` endpo ## Adding plugins -Plugins extend the instance with custom datasets, download functions, transforms, and processes. They live in `plugins_dir` and are loaded automatically at startup. The `plugins_dir` is added to `sys.path`, so Python modules placed directly inside it are importable. +Plugins extend the instance with custom datasets, ingestion plugins, transforms, and processes. They live in `plugins_dir` and are loaded automatically at startup. The `plugins_dir` is added to `sys.path`, so Python modules placed directly inside it are importable. 
``` plugins/ @@ -168,7 +168,7 @@ plugins/ │ └── enacts_rainfall.yaml # custom dataset template ├── enacts/ │ ├── __init__.py -│ └── daily.py # download function referenced in the YAML +│ └── plugin.py # IngestionPlugin class referenced in the YAML ├── transforms/ │ ├── __init__.py │ └── enacts.py # transform function @@ -177,7 +177,7 @@ plugins/ └── spatial_stats.py ``` -See [Extensibility](extensibility.md) for the full specification of each extension point, and [Adding custom datasets](adding_custom_datasets.md) for the dataset template field reference and download function contract. +See [Extensibility](extensibility.md) for the full specification of each extension point, and [Adding custom datasets](adding_custom_datasets.md) for the dataset template field reference and plugin contract. --- diff --git a/tests/test_config.py b/tests/test_config.py index 9427abf6..23223dd5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -171,7 +171,7 @@ def test_plugins_dir_adds_root_to_sys_path_and_makes_modules_importable( sync: kind: static ingestion: - function: myplugin.source.download + plugin: myplugin.source.MyPlugin """, encoding="utf-8", ) @@ -205,7 +205,7 @@ def test_plugins_dir_in_config_adds_to_bundled(monkeypatch: pytest.MonkeyPatch, sync: kind: static ingestion: - function: mypackage.sources.download + plugin: mypackage.sources.MyPlugin """, encoding="utf-8", ) @@ -238,7 +238,7 @@ def test_plugins_dir_resolved_relative_to_config_file(monkeypatch: pytest.Monkey sync: kind: static ingestion: - function: mypackage.sources.download + plugin: mypackage.sources.MyPlugin """, encoding="utf-8", ) @@ -264,7 +264,7 @@ def test_plugins_dir_in_config_overrides_bundled_by_id(monkeypatch: pytest.Monke sync: kind: static ingestion: - function: mypackage.sources.download + plugin: mypackage.sources.MyPlugin """, encoding="utf-8", ) diff --git a/tests/test_dataset_registry.py b/tests/test_dataset_registry.py index 9d44d290..2bd4bbf7 100644 --- 
a/tests/test_dataset_registry.py +++ b/tests/test_dataset_registry.py @@ -58,7 +58,7 @@ def test_dataset_registry_accepts_supported_sync_kind( sync: kind: temporal ingestion: - function: some.download.function + plugin: some.ingest.Plugin """, encoding="utf-8", ) @@ -129,7 +129,7 @@ def test_dataset_registry_accepts_supported_sync_execution( kind: temporal execution: append ingestion: - function: some.download.function + plugin: some.ingest.Plugin """, encoding="utf-8", ) @@ -154,7 +154,7 @@ def test_dataset_registry_rejects_invalid_sync_availability_function( availability: latest_available_function: 42 ingestion: - function: some.download.function + plugin: some.ingest.Plugin """, encoding="utf-8", ) @@ -180,7 +180,7 @@ def test_dataset_registry_accepts_sync_availability_function( availability: latest_available_function: climate_api.providers.availability.lagged_latest_available ingestion: - function: some.download.function + plugin: some.ingest.Plugin """, encoding="utf-8", ) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index a68e55cd..f036bdc3 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,4 +1,4 @@ -from datetime import UTC, datetime, tzinfo +from datetime import UTC, datetime from pathlib import Path import pytest @@ -287,301 +287,6 @@ def test_temporal_coverage_matches_request_scope_allows_open_ended_reuse() -> No ) -def test_create_artifact_computes_coverage_from_created_artifact_paths( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path -) -> None: - dataset: dict[str, object] = { - "id": "worldpop_population_yearly", - "name": "Total population (WorldPop Global12)", - "variable": "pop_total", - "period_type": "yearly", - } - created_file = tmp_path / "worldpop_population_yearly_2020.nc" - created_file.write_text("dummy", encoding="utf-8") - - monkeypatch.setattr(services.downloader, "download_dataset", lambda *_, **__: [created_file]) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: None) - 
monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - - captured: dict[str, object] = {} - - def fake_get_data_coverage_for_paths( - dataset_arg: dict[str, object], - *, - zarr_path: str | None = None, - netcdf_paths: list[str] | None = None, - ) -> dict[str, object]: - captured["dataset_id"] = dataset_arg["id"] - captured["zarr_path"] = zarr_path - captured["netcdf_paths"] = netcdf_paths - return { - "coverage": { - "temporal": {"start": "2020", "end": "2020"}, - "spatial": {"xmin": -13.3, "ymin": 6.9, "xmax": -10.2, "ymax": 10.0}, - } - } - - monkeypatch.setattr(services, "get_data_coverage_for_paths", fake_get_data_coverage_for_paths) - monkeypatch.setattr(services, "_store_artifact_record", lambda record, **_: record) - - artifact = services.create_artifact( - dataset=dataset, - start="2020", - end="2020", - bbox=[-13.5, 6.9, -10.1, 10.0], - country_code="SLE", - overwrite=False, - prefer_zarr=False, - publish=False, - ) - - assert captured["dataset_id"] == "worldpop_population_yearly" - assert captured["zarr_path"] is None - assert captured["netcdf_paths"] == [str(created_file.resolve())] - assert artifact.coverage.temporal.start == "2020" - assert artifact.coverage.temporal.end == "2020" - - -def test_create_artifact_normalizes_request_scope_to_dataset_period( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - dataset: dict[str, object] = { - "id": "era5land_temperature_hourly", - "name": "2m temperature (ERA5-Land)", - "variable": "t2m", - "period_type": "hourly", - } - created_file = tmp_path / "era5land_temperature_hourly_2026-04-21.nc" - created_file.write_text("dummy", encoding="utf-8") - - captured_download: dict[str, object] = {} - - def fake_download_dataset( - dataset_arg: dict[str, object], - *, - start: str, - end: str | None, - **_: object, - ) -> list[Path]: - captured_download["dataset_id"] = dataset_arg["id"] - captured_download["start"] = start - captured_download["end"] = end - return [created_file] - - 
monkeypatch.setattr(services.downloader, "download_dataset", fake_download_dataset) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: None) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - monkeypatch.setattr( - services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "coverage": { - "temporal": {"start": "2026-04-21T12", "end": "2026-04-21T13"}, - "spatial": {"xmin": 1.0, "ymin": 2.0, "xmax": 3.0, "ymax": 4.0}, - } - }, - ) - monkeypatch.setattr(services, "_store_artifact_record", lambda record, **_: record) - - artifact = services.create_artifact( - dataset=dataset, - start="2026-04-21T12:15:00", - end="2026-04-21T13:45:00", - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=False, - publish=False, - ) - - assert captured_download == { - "dataset_id": "era5land_temperature_hourly", - "start": "2026-04-21T12", - "end": "2026-04-21T13", - } - assert artifact.request_scope.start == "2026-04-21T12" - assert artifact.request_scope.end == "2026-04-21T13" - - -def test_create_artifact_defaults_omitted_end_to_dataset_native_period_for_download( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - dataset: dict[str, object] = { - "id": "era5land_temperature_hourly", - "name": "2m temperature (ERA5-Land)", - "variable": "t2m", - "period_type": "hourly", - } - created_file = tmp_path / "era5land_temperature_hourly_2026-04-21.nc" - created_file.write_text("dummy", encoding="utf-8") - - captured_download: dict[str, object] = {} - - class FixedDateTime(datetime): - @classmethod - def now(cls, tz: tzinfo | None = None) -> "FixedDateTime": - return cls(2026, 4, 21, 13, 47, 31, tzinfo=tz if tz is UTC else None) - - def fake_download_dataset( - dataset_arg: dict[str, object], - *, - start: str, - end: str | None, - **_: object, - ) -> list[Path]: - captured_download["dataset_id"] = dataset_arg["id"] - captured_download["start"] = start - captured_download["end"] = end - return 
[created_file] - - monkeypatch.setattr(services, "utc_now", lambda: FixedDateTime(2026, 4, 21, 13, 47, 31, tzinfo=UTC)) - monkeypatch.setattr(services.downloader, "download_dataset", fake_download_dataset) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: None) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - monkeypatch.setattr( - services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "coverage": { - "temporal": {"start": "2026-04-21T12", "end": "2026-04-21T13"}, - "spatial": {"xmin": 1.0, "ymin": 2.0, "xmax": 3.0, "ymax": 4.0}, - } - }, - ) - monkeypatch.setattr(services, "_store_artifact_record", lambda record, **_: record) - - artifact = services.create_artifact( - dataset=dataset, - start="2026-04-21T12:15:00", - end=None, - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=False, - publish=False, - ) - - assert captured_download == { - "dataset_id": "era5land_temperature_hourly", - "start": "2026-04-21T12", - "end": "2026-04-21T13", - } - assert artifact.request_scope.end is None - - -def test_create_artifact_returns_409_when_downloaded_artifact_has_no_data( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path -) -> None: - dataset: dict[str, object] = { - "id": "worldpop_population_yearly", - "name": "Total population (WorldPop Global12)", - "variable": "pop_total", - "period_type": "yearly", - } - created_file = tmp_path / "worldpop_population_yearly_2020.nc" - created_file.write_text("dummy", encoding="utf-8") - - monkeypatch.setattr(services.downloader, "download_dataset", lambda *_, **__: [created_file]) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: None) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - monkeypatch.setattr( - services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "has_data": False, - "coverage": { - "temporal": {"start": None, "end": None}, - "spatial": {"xmin": None, "ymin": None, 
"xmax": None, "ymax": None}, - }, - }, - ) - - with pytest.raises(services.HTTPException) as exc_info: - services.create_artifact( - dataset=dataset, - start="2020", - end="2020", - bbox=[-13.5, 6.9, -10.1, 10.0], - country_code="SLE", - overwrite=False, - prefer_zarr=False, - publish=False, - ) - - assert exc_info.value.status_code == 409 - assert exc_info.value.detail == "Downloaded artifact contains no data for the requested scope" - - -def test_create_artifact_can_download_delta_while_recording_full_request_scope( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - dataset: dict[str, object] = { - "id": "chirps3_precipitation_daily", - "name": "Total precipitation (CHIRPS3)", - "variable": "precip", - "period_type": "daily", - } - created_file = tmp_path / "chirps3_precipitation_daily_2026-02-01_2026-02-10.nc" - created_file.write_text("dummy", encoding="utf-8") - - captured_download: dict[str, object] = {} - - def fake_download_dataset( - dataset_arg: dict[str, object], - *, - start: str, - end: str | None, - **_: object, - ) -> list[Path]: - captured_download["dataset_id"] = dataset_arg["id"] - captured_download["start"] = start - captured_download["end"] = end - return [created_file] - - monkeypatch.setattr(services.downloader, "download_dataset", fake_download_dataset) - monkeypatch.setattr(services.downloader, "build_dataset_zarr", lambda *_, **__: None) - zarr_path_chirps = tmp_path / "chirps3_precipitation_daily.zarr" - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: zarr_path_chirps) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - monkeypatch.setattr( - services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "coverage": { - "temporal": {"start": "2026-01-01", "end": "2026-02-10"}, - "spatial": {"xmin": 1.0, "ymin": 2.0, "xmax": 3.0, "ymax": 4.0}, - } - }, - ) - monkeypatch.setattr(services, "_store_artifact_record", lambda record, **_: record) - - artifact = 
services.create_artifact( - dataset=dataset, - start="2026-01-01", - end="2026-02-10", - download_start="2026-02-01", - download_end="2026-02-10", - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=True, - publish=False, - ) - - assert captured_download == { - "dataset_id": "chirps3_precipitation_daily", - "start": "2026-02-01", - "end": "2026-02-10", - } - assert artifact.request_scope.start == "2026-01-01" - assert artifact.request_scope.end == "2026-02-10" - assert artifact.coverage.temporal.start == "2026-01-01" - assert artifact.coverage.temporal.end == "2026-02-10" - - def test_create_artifact_rejects_partial_download_scope(monkeypatch: pytest.MonkeyPatch) -> None: dataset: dict[str, object] = { "id": "chirps3_precipitation_daily", @@ -598,7 +303,6 @@ def test_create_artifact_rejects_partial_download_scope(monkeypatch: pytest.Monk download_start=None, download_end="2026-02-10", bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, overwrite=False, prefer_zarr=True, publish=False, @@ -626,7 +330,6 @@ def test_create_artifact_rejects_download_scope_outside_request_scope(monkeypatc download_start="2026-02-01", download_end="2026-02-11", bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, overwrite=False, prefer_zarr=True, publish=False, @@ -636,216 +339,6 @@ def test_create_artifact_rejects_download_scope_outside_request_scope(monkeypatc assert "download_end must be less than or equal to end" in str(exc_info.value.detail) -def test_create_artifact_delta_does_not_reuse_netcdf_artifact_when_canonical_zarr_is_required( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path -) -> None: - dataset: dict[str, object] = { - "id": "chirps3_precipitation_daily", - "name": "Total precipitation (CHIRPS3)", - "variable": "precip", - "period_type": "daily", - } - created_file = tmp_path / "chirps3_precipitation_daily_2026-02-01_2026-02-10.nc" - created_file.write_text("dummy", encoding="utf-8") - zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" - - 
netcdf_existing = _artifact(artifact_id="existing", end="2026-02-10") - netcdf_existing.format = ArtifactFormat.NETCDF - netcdf_existing.path = str(created_file) - netcdf_existing.asset_paths = [str(created_file)] - netcdf_existing.request_scope = ArtifactRequestScope( - start="2026-01-01", - end="2026-02-10", - bbox=(1.0, 2.0, 3.0, 4.0), - ) - - lookup_preferences: list[bool] = [] - - def fake_find_existing_artifact(**kwargs: object) -> ArtifactRecord | None: - lookup_preferences.append(bool(kwargs["prefer_zarr"])) - return None if kwargs["prefer_zarr"] else netcdf_existing - - monkeypatch.setattr(services, "_find_existing_artifact", fake_find_existing_artifact) - monkeypatch.setattr(services.downloader, "download_dataset", lambda *_, **__: [created_file]) - monkeypatch.setattr(services.downloader, "build_dataset_zarr", lambda *_, **__: None) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: zarr_path) - monkeypatch.setattr(services.downloader, "get_cache_files", lambda _: [created_file]) - monkeypatch.setattr( - services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "coverage": { - "temporal": {"start": "2026-01-01", "end": "2026-02-10"}, - "spatial": {"xmin": 1.0, "ymin": 2.0, "xmax": 3.0, "ymax": 4.0}, - } - }, - ) - monkeypatch.setattr(services, "_store_artifact_record", lambda record, **_: record) - - artifact = services.create_artifact( - dataset=dataset, - start="2026-01-01", - end="2026-02-10", - download_start="2026-02-01", - download_end="2026-02-10", - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=False, - publish=False, - ) - - assert lookup_preferences == [True] - assert artifact.format == ArtifactFormat.ZARR - - -def test_create_artifact_delta_requires_canonical_zarr_when_prefer_zarr_is_false( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - dataset: dict[str, object] = { - "id": "chirps3_precipitation_daily", - "name": "Total precipitation (CHIRPS3)", - "variable": 
"precip", - "period_type": "daily", - } - created_file = tmp_path / "chirps3_precipitation_daily_2026-02-01_2026-02-10.nc" - created_file.write_text("dummy", encoding="utf-8") - zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" - - captured_build: dict[str, object] = {} - - def fake_build_dataset_zarr(dataset_arg: dict[str, object], *, start: str | None, end: str | None) -> None: - captured_build["dataset_id"] = dataset_arg["id"] - captured_build["start"] = start - captured_build["end"] = end - - monkeypatch.setattr(services.downloader, "download_dataset", lambda *_, **__: [created_file]) - monkeypatch.setattr(services.downloader, "build_dataset_zarr", fake_build_dataset_zarr) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: zarr_path) - monkeypatch.setattr(services.downloader, "get_cache_files", lambda _: [created_file]) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - monkeypatch.setattr( - services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "coverage": { - "temporal": {"start": "2026-01-01", "end": "2026-02-10"}, - "spatial": {"xmin": 1.0, "ymin": 2.0, "xmax": 3.0, "ymax": 4.0}, - } - }, - ) - monkeypatch.setattr(services, "_store_artifact_record", lambda record, **_: record) - - artifact = services.create_artifact( - dataset=dataset, - start="2026-01-01", - end="2026-02-10", - download_start="2026-02-01", - download_end="2026-02-10", - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=False, - publish=False, - ) - - assert captured_build == { - "dataset_id": "chirps3_precipitation_daily", - "start": "2026-01-01", - "end": "2026-02-10", - } - assert artifact.format == ArtifactFormat.ZARR - - -def test_create_artifact_delta_fails_when_canonical_zarr_build_fails( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - dataset: dict[str, object] = { - "id": "chirps3_precipitation_daily", - "name": "Total precipitation (CHIRPS3)", - "variable": 
"precip", - "period_type": "daily", - } - created_file = tmp_path / "chirps3_precipitation_daily_2026-02-01_2026-02-10.nc" - created_file.write_text("dummy", encoding="utf-8") - - def fail_build_dataset_zarr(*_: object, **__: object) -> None: - raise ValueError("zarr failed") - - monkeypatch.setattr(services.downloader, "download_dataset", lambda *_, **__: [created_file]) - monkeypatch.setattr(services.downloader, "build_dataset_zarr", fail_build_dataset_zarr) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: None) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - - with pytest.raises(services.HTTPException) as exc_info: - services.create_artifact( - dataset=dataset, - start="2026-01-01", - end="2026-02-10", - download_start="2026-02-01", - download_end="2026-02-10", - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=True, - publish=False, - ) - - assert exc_info.value.status_code == 409 - assert "Append sync canonical Zarr rebuild failed for requested scope: zarr failed" in str(exc_info.value.detail) - - -def test_create_artifact_delta_rejects_short_rebuilt_coverage( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - dataset: dict[str, object] = { - "id": "chirps3_precipitation_daily", - "name": "Total precipitation (CHIRPS3)", - "variable": "precip", - "period_type": "daily", - } - created_file = tmp_path / "chirps3_precipitation_daily_2026-02-01_2026-02-10.nc" - created_file.write_text("dummy", encoding="utf-8") - zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" - - monkeypatch.setattr(services.downloader, "download_dataset", lambda *_, **__: [created_file]) - monkeypatch.setattr(services.downloader, "build_dataset_zarr", lambda *_, **__: None) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: zarr_path) - monkeypatch.setattr(services.downloader, "get_cache_files", lambda _: [created_file]) - monkeypatch.setattr(services, 
"_find_existing_artifact", lambda **_: None) - monkeypatch.setattr( - services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "coverage": { - "temporal": {"start": "2026-02-01", "end": "2026-02-10"}, - "spatial": {"xmin": 1.0, "ymin": 2.0, "xmax": 3.0, "ymax": 4.0}, - } - }, - ) - - with pytest.raises(services.HTTPException) as exc_info: - services.create_artifact( - dataset=dataset, - start="2026-01-01", - end="2026-02-10", - download_start="2026-02-01", - download_end="2026-02-10", - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=True, - publish=False, - ) - - assert exc_info.value.status_code == 409 - assert "coverage=2026-02-01..2026-02-10" in str(exc_info.value.detail) - assert "request=2026-01-01..2026-02-10" in str(exc_info.value.detail) - - def _icechunk_artifact( *, artifact_id: str = "ic1", diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index 6694f9cb..ce1fcc1c 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -142,7 +142,6 @@ def fake_create_artifact(**kwargs: object) -> ArtifactRecord: assert captured["start"] == "2026-01-01" assert captured["end"] == "2026-02-10" assert captured["bbox"] == [1.0, 2.0, 3.0, 4.0] - assert captured["country_code"] == "SLE" assert result.sync_id == "a2" assert result.status == "completed" assert result.message == "Managed dataset was rematerialized against the latest planned upstream state." 
@@ -805,7 +804,6 @@ def test_run_sync_raises_clear_error_when_append_invariants_are_missing(monkeypa latest_artifact=latest_artifact, source_dataset={"id": "chirps3_precipitation_daily", "period_type": "daily", "sync": {"kind": "temporal"}}, requested_end="2026-02-11", - country_code=None, prefer_zarr=True, publish=True, create_artifact_fn=lambda **_: pytest.fail("create_artifact should not be called"), @@ -813,55 +811,6 @@ def test_run_sync_raises_clear_error_when_append_invariants_are_missing(monkeypa ) -def test_sync_dataset_forwards_country_code_from_extent(monkeypatch: pytest.MonkeyPatch) -> None: - dataset_id = "worldpop_population_yearly_sle" - latest = _artifact( - artifact_id="a1", - source_dataset_id="worldpop_population_yearly", - managed_dataset_id=dataset_id, - end="2020", - ) - monkeypatch.setattr(services, "get_latest_artifact_for_dataset_or_404", lambda _: latest) - monkeypatch.setattr( - services.registry_datasets, - "get_dataset", - lambda _: {"id": "worldpop_population_yearly", "period_type": "yearly", "sync": {"kind": "release"}}, - ) - monkeypatch.setattr( - services, - "get_extent", - lambda: {"id": "sle", "bbox": [-13.5, 6.9, -10.1, 10.0], "country_code": "SLE"}, - ) - - captured: dict[str, object] = {} - - def fake_run_sync(**kwargs: object) -> SyncResponse: - captured.update(kwargs) - return SyncResponse( - sync_id="a2", - status="completed", - message="ok", - dataset=_dataset_detail(dataset_id), - sync_detail=SyncDetail( - source_dataset_id="worldpop_population_yearly", - sync_kind=SyncKind.RELEASE, - action=SyncAction.REMATERIALIZE, - reason="new_release_available", - message="ok", - current_start="2020", - current_end="2020", - target_end="2021", - target_end_source="request", - ), - ) - - monkeypatch.setattr(services, "run_sync", fake_run_sync) - - services.sync_dataset(dataset_id=dataset_id, end="2021", prefer_zarr=True, publish=True) - - assert captured["country_code"] == "SLE" - - # 
--------------------------------------------------------------------------- # Icechunk store-based sync # --------------------------------------------------------------------------- @@ -1071,17 +1020,18 @@ def _patch_icechunk_artifact_dependencies( captured: dict[str, object], ) -> None: """Patch all inline imports used by _create_icechunk_artifact.""" + import numpy as np + import xarray as xr + import climate_api.ingest.orchestrator as orchestrator_mod import climate_api.ingest.store as store_mod from climate_api.ingestions import services as svc - import xarray as xr - import numpy as np def fake_run_ingest_sync(**kwargs: object) -> None: captured.update(kwargs) monkeypatch.setattr(orchestrator_mod, "run_ingest_sync", fake_run_ingest_sync) - monkeypatch.setattr(orchestrator_mod, "load_plugin", lambda path, params: object()) + monkeypatch.setattr(orchestrator_mod, "load_plugin", lambda path, params, extra_params=None: object()) monkeypatch.setattr(store_mod, "open_or_create_repo", lambda _: _FakeRepo()) monkeypatch.setattr(svc, "coverage_from_open_dataset", lambda ds, **_: { "has_data": True, diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 398a3e40..d95e9611 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -7,7 +7,6 @@ import pytest import xarray as xr import zarr -from fastapi import HTTPException from topozarr.pyramid import Pyramid from xarray import DataTree @@ -46,109 +45,6 @@ def test_resolve_artifacts_dir_uses_xdg_when_no_config(monkeypatch: pytest.Monke assert ingestion_services._resolve_artifacts_dir() == Path(xdg) / "climate-api" / "artifacts" -def test_download_dataset_returns_400_when_country_code_is_required(monkeypatch: pytest.MonkeyPatch) -> None: - def fake_download( - *, - start: str, - end: str, - dirname: object, - prefix: str, - overwrite: bool, - country_code: str, - ) -> None: - del start, end, dirname, prefix, overwrite, country_code - - dataset: dict[str, Any] = { - "id": 
"worldpop_population_yearly", - "ingestion": {"function": "ignored.path"}, - } - monkeypatch.delenv("COUNTRY_CODE", raising=False) - monkeypatch.setattr(downloader, "_get_dynamic_function", lambda _: fake_download) - - with pytest.raises(HTTPException) as exc_info: - downloader.download_dataset( - dataset=dataset, - start="2020-01-01", - end="2020-12-31", - bbox=None, - country_code=None, - overwrite=False, - background_tasks=None, - ) - - assert exc_info.value.status_code == 400 - assert "requires a country code" in str(exc_info.value.detail) - - -def test_download_dataset_returns_400_for_missing_bbox(monkeypatch: pytest.MonkeyPatch) -> None: - def fake_download( - *, - start: str, - end: str, - dirname: object, - prefix: str, - overwrite: bool, - bbox: list[float], - ) -> None: - del start, end, dirname, prefix, overwrite, bbox - - dataset: dict[str, Any] = { - "id": "chirps3_precipitation_daily", - "ingestion": {"function": "ignored.path"}, - } - monkeypatch.delenv("DOWNLOAD_BBOX", raising=False) - monkeypatch.delenv("DEFAULT_DOWNLOAD_BBOX", raising=False) - monkeypatch.setattr(downloader, "_get_dynamic_function", lambda _: fake_download) - - with pytest.raises(HTTPException) as exc_info: - downloader.download_dataset( - dataset=dataset, - start="2020-01-01", - end="2020-01-31", - bbox=None, - country_code=None, - overwrite=False, - background_tasks=None, - ) - - assert exc_info.value.status_code == 400 - assert "A bbox is required" in str(exc_info.value.detail) - - -def test_download_dataset_returns_502_for_upstream_provider_failure(monkeypatch: pytest.MonkeyPatch) -> None: - def fake_download( - *, - start: str, - end: str, - dirname: object, - prefix: str, - overwrite: bool, - country_code: str, - ) -> None: - del start, end, dirname, prefix, overwrite, country_code - raise RuntimeError("provider timeout") - - dataset: dict[str, Any] = { - "id": "worldpop_population_yearly", - "ingestion": {"function": "ignored.path"}, - } - monkeypatch.setattr(downloader, 
"_get_dynamic_function", lambda _: fake_download) - - with pytest.raises(HTTPException) as exc_info: - downloader.download_dataset( - dataset=dataset, - start="2020-01-01", - end="2020-12-31", - bbox=None, - country_code="SLE", - overwrite=False, - background_tasks=None, - ) - - assert exc_info.value.status_code == 502 - assert "Upstream dataset download failed: provider timeout" == str(exc_info.value.detail) - - # --------------------------------------------------------------------------- # _get_cache_prefix # --------------------------------------------------------------------------- @@ -159,113 +55,6 @@ def test_get_cache_prefix_uses_dataset_id() -> None: assert downloader._get_cache_prefix(dataset) == "chirps3_precipitation_daily" -# --------------------------------------------------------------------------- -# _validate_spatial_coverage -# --------------------------------------------------------------------------- - - -_CHIRPS3_EXTENTS: dict[str, Any] = { - "spatial": {"bbox": [-180, -50, 180, 50], "crs": "http://www.opengis.net/def/crs/OGC/1.3/CRS84"} -} -_LIMITED_LON_EXTENTS: dict[str, Any] = { - "spatial": {"bbox": [-180, -90, 60, 90], "crs": "http://www.opengis.net/def/crs/OGC/1.3/CRS84"} -} - - -def test_validate_spatial_coverage_passes_when_no_extents_declared() -> None: - dataset: dict[str, Any] = {"id": "worldpop_population_yearly", "ingestion": {}} - downloader._validate_spatial_coverage(dataset, bbox=[4.5, 57.9, 31.1, 71.2]) - - -def test_validate_spatial_coverage_passes_when_no_bbox() -> None: - dataset: dict[str, Any] = {"id": "chirps3_precipitation_daily", "ingestion": {}, "extents": _CHIRPS3_EXTENTS} - downloader._validate_spatial_coverage(dataset, bbox=None) - - -def test_validate_spatial_coverage_passes_when_template_bbox_malformed() -> None: - extents: dict[str, Any] = {"spatial": {"bbox": "not-a-list"}} - dataset: dict[str, Any] = {"id": "bad_template", "ingestion": {}, "extents": extents} - downloader._validate_spatial_coverage(dataset, 
bbox=[-10.0, -10.0, 10.0, 10.0]) - - -def test_validate_spatial_coverage_passes_when_bbox_inside_extents() -> None: - dataset: dict[str, Any] = {"id": "chirps3_precipitation_daily", "ingestion": {}, "extents": _CHIRPS3_EXTENTS} - downloader._validate_spatial_coverage(dataset, bbox=[-10.0, -10.0, 10.0, 10.0]) - - -def test_validate_spatial_coverage_raises_when_bbox_outside_lat_extents() -> None: - dataset: dict[str, Any] = { - "id": "chirps3_precipitation_daily", - "ingestion": {}, - "extents": _CHIRPS3_EXTENTS, - } - with pytest.raises(HTTPException) as exc_info: - downloader._validate_spatial_coverage(dataset, bbox=[4.5, 57.9, 31.1, 71.2]) - assert exc_info.value.status_code == 400 - assert "does not cover this extent" in str(exc_info.value.detail) - assert "Latitude" in str(exc_info.value.detail) - - -def test_validate_spatial_coverage_raises_when_bbox_outside_lon_extents() -> None: - dataset: dict[str, Any] = { - "id": "some_dataset", - "ingestion": {}, - "extents": _LIMITED_LON_EXTENTS, - } - with pytest.raises(HTTPException) as exc_info: - downloader._validate_spatial_coverage(dataset, bbox=[70.0, -10.0, 90.0, 10.0]) - assert exc_info.value.status_code == 400 - assert "Longitude" in str(exc_info.value.detail) - - -def test_download_dataset_validates_env_bbox_against_extents( - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Coverage validation uses the env fallback bbox when no bbox is passed in the request.""" - dataset: dict[str, Any] = { - "id": "chirps3_precipitation_daily", - "ingestion": {"function": "ignored.path"}, - "extents": _CHIRPS3_EXTENTS, - } - monkeypatch.setenv("DOWNLOAD_BBOX", "4.5,57.9,31.1,71.2") - - with pytest.raises(HTTPException) as exc_info: - downloader.download_dataset( - dataset=dataset, - start="2020-01-01", - end="2020-01-31", - bbox=None, - country_code=None, - overwrite=False, - background_tasks=None, - ) - assert exc_info.value.status_code == 400 - assert "does not cover this extent" in str(exc_info.value.detail) - - -def 
test_download_dataset_returns_400_when_bbox_outside_dataset_extents( - monkeypatch: pytest.MonkeyPatch, -) -> None: - dataset: dict[str, Any] = { - "id": "chirps3_precipitation_daily", - "ingestion": {"function": "ignored.path"}, - "extents": _CHIRPS3_EXTENTS, - } - - with pytest.raises(HTTPException) as exc_info: - downloader.download_dataset( - dataset=dataset, - start="2020-01-01", - end="2020-01-31", - bbox=[4.5, 57.9, 31.1, 71.2], - country_code=None, - overwrite=False, - background_tasks=None, - ) - assert exc_info.value.status_code == 400 - assert "does not cover this extent" in str(exc_info.value.detail) - - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- diff --git a/tests/test_ingest_orchestrator.py b/tests/test_ingest_orchestrator.py index a223b78b..b606cbda 100644 --- a/tests/test_ingest_orchestrator.py +++ b/tests/test_ingest_orchestrator.py @@ -474,8 +474,9 @@ def test_rechunk_store_changes_chunk_size(tmp_path: Path) -> None: def test_rechunk_store_skips_when_no_time_dimension(tmp_path: Path) -> None: """rechunk_store is a no-op when the store has no time dimension.""" import icechunk - import xarray as xr import numpy as np + import xarray as xr + from climate_api.ingest.store import rechunk_store store_path = tmp_path / "static.icechunk" diff --git a/tests/test_ingest_plugins.py b/tests/test_ingest_plugins.py index e622696b..2ed9523e 100644 --- a/tests/test_ingest_plugins.py +++ b/tests/test_ingest_plugins.py @@ -8,7 +8,6 @@ from __future__ import annotations import asyncio -import io from datetime import date from typing import Any from unittest.mock import MagicMock, patch @@ -20,7 +19,6 @@ from climate_api.ingest.protocol import GridSpec, IngestionPlugin - # --------------------------------------------------------------------------- # WorldPopPlugin # --------------------------------------------------------------------------- diff 
--git a/uv.lock b/uv.lock index ca02ddd9..4dc46e43 100644 --- a/uv.lock +++ b/uv.lock @@ -408,6 +408,7 @@ dependencies = [ { name = "geojson-pydantic" }, { name = "geozarr-toolkit" }, { name = "httpx" }, + { name = "icechunk" }, { name = "jinja2" }, { name = "metpy" }, { name = "portalocker" }, @@ -441,6 +442,7 @@ requires-dist = [ { name = "geojson-pydantic", specifier = ">=2.1.0" }, { name = "geozarr-toolkit", specifier = "==0.1.*" }, { name = "httpx", specifier = ">=0.28.1" }, + { name = "icechunk", specifier = ">=2.0,<3" }, { name = "jinja2", specifier = ">=3.1" }, { name = "metpy", specifier = ">=1.7,<2" }, { name = "portalocker", specifier = ">=3.2.0" }, @@ -1101,6 +1103,37 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[[package]] +name = "icechunk" +version = "2.0.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zarr" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/97/99/fabc9794c1f82e51b5c7c66301695b8fd920f72dee1726104dbdbd8df3e7/icechunk-2.0.5.tar.gz", hash = "sha256:50a2a44a1b561d3f2d3b5d19725c3759f300dc67225a2360fc793d894abfcab1", size = 3327412, upload-time = "2026-05-18T20:22:05.466Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/4e/73e0851289894ce7ba1d88e8ecc00f49dbf51220129ac3ab703a0a599eab/icechunk-2.0.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:9c7baeab6837a1e0aec8b5dd63f865b842d66d92314e02de40612190acdcaa2c", size = 16834379, upload-time = "2026-05-18T20:22:42.89Z" }, + { url = "https://files.pythonhosted.org/packages/2b/54/84e504554e9a502a4bed4d2d1e72ff0cd256e103e0c632a40e31d6c7fc9d/icechunk-2.0.5-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:bd02bc6fb32c9c6477f45d6818e7e363d5365884d42c03d66d97801c9ed98726", size = 15538385, upload-time = "2026-05-18T20:22:33.645Z" }, + { url = "https://files.pythonhosted.org/packages/49/a9/3241119145b05beec05b45e46581faebdc02c14f5f923ebbb56ac6d0bdb0/icechunk-2.0.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5105891c297c9a8c3ff96bef78125a576224eaf3b970e6f0aa6ec4d8cab876", size = 17229820, upload-time = "2026-05-18T20:22:23.591Z" }, + { url = "https://files.pythonhosted.org/packages/4e/ac/8392e6b23841aa81324144b7893bf7685137658848a2d8cffbd4955d89b1/icechunk-2.0.5-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b73c5ce8987a6975970c050751d47b43af7ecaaf94c1dfa130446e3972b7f8bd", size = 16867941, upload-time = "2026-05-18T20:21:59.35Z" }, + { url = "https://files.pythonhosted.org/packages/ca/95/a323289e37ccdd6e4a7ddc8251681d921428c788c9b7f05e731d537015f0/icechunk-2.0.5-cp313-cp313-manylinux_2_28_armv7l.whl", hash = "sha256:53d7c7926251c8a45d1b526e1fe348619239c011920402f6466cfcfbcd96ba74", size = 16697076, upload-time = "2026-05-18T20:22:11.771Z" }, + { url = "https://files.pythonhosted.org/packages/2d/82/ef006b6433127a7b6aa9fbd9183cb2f4ce69df61096220e16d0292b563c9/icechunk-2.0.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:47883961b0b570eb206b3218ee285ad9e36dd407c5988ee39032356c73a90f40", size = 17086219, upload-time = "2026-05-18T20:22:53.068Z" }, + { url = "https://files.pythonhosted.org/packages/6e/42/0d1ede1a3cd383f2bf00cc8905816339885022efde31951015fb2e110885/icechunk-2.0.5-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:4779422967dcf69b2c2606211ae48b313e865a0caf3414afab790422ef1b5d7e", size = 16868204, upload-time = "2026-05-18T20:23:03.334Z" }, + { url = "https://files.pythonhosted.org/packages/ae/b3/baefe0939737277efbec5b97fe0f4286f53f81a423d7cab2e7d5128f1633/icechunk-2.0.5-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:83bf18ac31aace8325ce9cc1ee73581f007c0bf3061ad10b4b3ffff8b734977b", size = 
16945422, upload-time = "2026-05-18T20:23:13.69Z" }, + { url = "https://files.pythonhosted.org/packages/be/1f/144799746b0b5269458c4a24049bae7f4d52da55735c10e173de5c76c134/icechunk-2.0.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ae7bb09eafeb714597d795e74506a9f258a798d7a26d840c37b3bfec44e988a4", size = 17637469, upload-time = "2026-05-18T20:23:23.199Z" }, + { url = "https://files.pythonhosted.org/packages/02/bc/1dec19138d4ab82175a8b2cddd24320003476c8e9e4d2cafa70b096d5b76/icechunk-2.0.5-cp313-cp313-win_amd64.whl", hash = "sha256:978ddc20fb1e6abfaacdb31810a37a83b94b488d7795e77931576f220930f72a", size = 15936134, upload-time = "2026-05-18T20:23:37.825Z" }, + { url = "https://files.pythonhosted.org/packages/e3/47/0e29dd5248dbaddef6351e9807d61c4eacae325f3b1415a2bb0ce5b2e92e/icechunk-2.0.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:2d2369db67502118150612ad481468cc0ae1333ba5cf6084179da04591e566b2", size = 16841657, upload-time = "2026-05-18T20:22:45.594Z" }, + { url = "https://files.pythonhosted.org/packages/2f/c5/9652585ee78a0f242d2c92983a550990b6a39779d6d0a7c24ab82f293d39/icechunk-2.0.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0554ef924b14f7d6369919c22f042a817f8dbc85d0b9efd793398e2d44786fa4", size = 15544742, upload-time = "2026-05-18T20:22:36.879Z" }, + { url = "https://files.pythonhosted.org/packages/59/2c/076e478b9b45616ef268bdb0473d6d0c8bfc4ad62224477c0c066b74ebe0/icechunk-2.0.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43289b68bc3ca93804a8ffd96e356f509c87bfd35c3d8202382dd97febd9957d", size = 17240728, upload-time = "2026-05-18T20:22:26.306Z" }, + { url = "https://files.pythonhosted.org/packages/fa/06/8c0f3fb0df245f42d05d4b423a92766a466ad9e36711972da84196263d14/icechunk-2.0.5-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:35812c0c3087688c422470610fa9f77f6edb2d5d7c3345e0610beb32c8028f96", size = 16883484, upload-time = "2026-05-18T20:22:02.374Z" }, + { url = 
"https://files.pythonhosted.org/packages/0f/6a/74440741ac30bb09c9d3fb74acb1332f218ae87faba24f6a81c8c575b9d2/icechunk-2.0.5-cp314-cp314-manylinux_2_28_armv7l.whl", hash = "sha256:ba30ea180b056c34fcf094b36fd7fb7cbd2521f778e5d1080ce9ab9b2f28204d", size = 16706325, upload-time = "2026-05-18T20:22:15.083Z" }, + { url = "https://files.pythonhosted.org/packages/4b/09/fe20275108f44a7ad8bbb904165feaf65f31796a8dad7baa764d86181fa9/icechunk-2.0.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:65075bce7674d2292344a661129fa987a579e5ab46c35862ef366b0c167e1066", size = 17099016, upload-time = "2026-05-18T20:22:56.368Z" }, + { url = "https://files.pythonhosted.org/packages/fb/f6/bd9b9d79b1a10a8382116ac018f3580f6e8021674fe53167829fba70bd7e/icechunk-2.0.5-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:231f8a46de0949da8ffd0e59446342a1051e374276385ca9a529ab593e73c7ce", size = 16878446, upload-time = "2026-05-18T20:23:06.761Z" }, + { url = "https://files.pythonhosted.org/packages/c6/af/025f9c303dca742c709a8c01f809479c3d22bdf630954f08eadb6eaace5b/icechunk-2.0.5-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:65e514a6394b1ed5f3726d7da1cc3cafae8e23c7a6b243dc793b1eb876078813", size = 16955472, upload-time = "2026-05-18T20:23:16.865Z" }, + { url = "https://files.pythonhosted.org/packages/52/b3/7429c90e9a0512f6e11443ccbc9d4ee392757b53c8604db05578457ffac9/icechunk-2.0.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e1a488e9162d64e05e995aa4015cc8118c68fc54d0ed5af4616627a66a4d1d01", size = 17646740, upload-time = "2026-05-18T20:23:30.02Z" }, + { url = "https://files.pythonhosted.org/packages/25/3c/8e086299cc1a779837e65e4152d03d02b457f8974bb388082fef20894b5e/icechunk-2.0.5-cp314-cp314-win_amd64.whl", hash = "sha256:95b1beb874ad287fcb99dfea29cd5218c795b5d9bca47b8f43ef78b8e6c5b572", size = 15945027, upload-time = "2026-05-18T20:23:40.81Z" }, +] + [[package]] name = "idna" version = "3.11" From dfbc9affd2357ab818836d8cf5e5a0c2da4d2ec8 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 21:22:09 +0200 Subject: [PATCH 12/80] fix: strip CF encoding attrs (add_offset, scale_factor) from WorldPop datasets WorldPop GeoTIFFs carry scale_factor/add_offset in variable attributes. xarray rejects these as conflicting CF encoding keys when appending to a Zarr store on the second period. Strip all CF encoding keys from attrs after loading so append_dim writes succeed. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/plugins/worldpop.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/climate_api/ingest/plugins/worldpop.py b/climate_api/ingest/plugins/worldpop.py index 3c312823..182714d5 100644 --- a/climate_api/ingest/plugins/worldpop.py +++ b/climate_api/ingest/plugins/worldpop.py @@ -105,8 +105,10 @@ def _fetch_sync(self, year: int, bbox: list[float]) -> xr.Dataset: ds = da.to_dataset(name="pop_total") ds = ds.expand_dims(time=[np.datetime64(f"{year}-01-01", "D")]) + _CF_ENCODING_KEYS = {"scale_factor", "add_offset", "missing_value", "_FillValue", "coordinates"} for name in list(ds.data_vars) + list(ds.coords): ds[name].encoding.clear() + ds[name].attrs = {k: v for k, v in ds[name].attrs.items() if k not in _CF_ENCODING_KEYS} ds["time"].encoding.update({"units": "days since 1970-01-01", "dtype": "int32"}) return ds From dcd0ca91c3d2a8ee124d5906bc26868a8ea6c77e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 21:46:53 +0200 Subject: [PATCH 13/80] fix: detect actual data CRS for STAC proj:code and coverage spatial_wgs84 Datasets like WorldPop are stored in WGS84 (EPSG:4326) while a deployment may be configured with a projected CRS (e.g. EPSG:32633). Using the deployment CRS as proj:code caused the map viewer to wrongly treat WGS84 coordinates as projected, breaking rendering. spatial_wgs84 coverage was also computed by reprojecting already-WGS84 coordinates as if they were UTM metres. 
Fix: read the actual CRS from the dataset's spatial_ref coordinate (written by rioxarray) and use it for both proj:code in the STAC collection and native_crs in coverage computation. Falls back to the configured deployment CRS when no spatial_ref is present. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingestions/services.py | 18 +++++++++++++++- climate_api/stac/services.py | 34 +++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index e2882537..2f47ec08 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -50,6 +50,22 @@ logger = logging.getLogger(__name__) +def _read_crs_from_spatial_ref(ds: object) -> str | None: + """Return 'EPSG:' from a dataset's spatial_ref coordinate, or None.""" + if "spatial_ref" not in ds.coords: + return None + try: + import pyproj + attrs = dict(ds["spatial_ref"].attrs) + wkt = attrs.get("crs_wkt") or attrs.get("spatial_ref") + if not wkt: + return None + epsg = pyproj.CRS.from_wkt(str(wkt)).to_epsg() + return f"EPSG:{epsg}" if epsg else None + except Exception: + return None + + def _resolve_artifacts_dir() -> Path: from climate_api import config as api_config @@ -272,7 +288,7 @@ def _create_icechunk_artifact( ds = xr.open_zarr(session.store) from climate_api import config as api_config - native_crs = api_config.get_crs() or "EPSG:4326" + native_crs = _read_crs_from_spatial_ref(ds) or api_config.get_crs() or "EPSG:4326" coverage_data = coverage_from_open_dataset(ds, period_type=period_type, native_crs=native_crs) ds.close() diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 022ed8d9..e6b8740e 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -212,13 +212,23 @@ def _build_collection_with_xstac(*, artifact: ArtifactRecord, template: pystac.C try: x_dimension, y_dimension = get_x_y_dims(ds) time_dimension = get_time_dim(ds) 
+ # Detect the actual data CRS so proj:code reflects the store's native coordinate + # system rather than the deployment CRS. This matters when a dataset (e.g. WorldPop) + # is stored in WGS84 while the deployment is configured for a projected CRS. + detected_crs = _detect_dataset_crs(ds) + if detected_crs: + template.extra_fields["proj:code"] = detected_crs + try: + reference_system = int(detected_crs.split(":")[-1]) if detected_crs else 4326 + except ValueError: + reference_system = 4326 result = xarray_to_stac( ds, template, temporal_dimension=time_dimension, x_dimension=x_dimension, y_dimension=y_dimension, - reference_system=4326, + reference_system=reference_system, # Schema validation can trigger outbound fetches for STAC extension schemas. validate=False, ) @@ -478,3 +488,25 @@ def _zarr_consolidated_flag(artifact_path: str) -> bool | None: if (store_root / ".zgroup").exists(): return False return None + + +def _detect_dataset_crs(ds: Any) -> str | None: + """Read the EPSG CRS code from a dataset's spatial_ref coordinate, if present. + + Returns a string like 'EPSG:4326' or None if undetectable. Used to override + the deployment-wide proj:code with the actual native CRS of the data so that + datasets stored in WGS84 (e.g. WorldPop) are not misidentified as projected. 
+ """ + if "spatial_ref" not in ds.coords: + return None + try: + import pyproj + attrs = dict(ds["spatial_ref"].attrs) + wkt = attrs.get("crs_wkt") or attrs.get("spatial_ref") + if not wkt: + return None + crs = pyproj.CRS.from_wkt(str(wkt)) + epsg = crs.to_epsg() + return f"EPSG:{epsg}" if epsg else None + except Exception: + return None From 9c7ad600b50e1bcace4c863c04a29f39fbec173f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 22:00:47 +0200 Subject: [PATCH 14/80] fix: pass overwrite flag through _create_icechunk_artifact to upsert _store_artifact_record silently returns the existing record when one already exists, so re-runs with overwrite=True never updated coverage or CRS data. Switch to _upsert_artifact_record (which replaces the record when overwrite=True) and thread the flag through the call chain. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingestions/services.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 2f47ec08..12a5a8a8 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -218,6 +218,7 @@ def create_artifact( end=resolved_download_end, bbox=bbox, request_scope=request_scope, + overwrite=overwrite, publish=publish, ingest_start=download_start, ) @@ -230,6 +231,7 @@ def _create_icechunk_artifact( end: str, bbox: list[float] | None, request_scope: ArtifactRequestScope, + overwrite: bool = False, publish: bool, ingest_start: str | None = None, ) -> ArtifactRecord: @@ -316,7 +318,7 @@ def _create_icechunk_artifact( created_at=datetime.now(UTC), publication=ArtifactPublication(), ) - stored = _store_artifact_record(record, prefer_zarr=False, publish=publish) + stored = _upsert_artifact_record(record, prefer_zarr=False, publish=publish, overwrite=overwrite) logger.info( "Stored Icechunk artifact '%s' for '%s': coverage=%s..%s", stored.artifact_id, From 
a8f8d1ccc8974878f2cbf6b8f37aade0e7258840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 22:10:38 +0200 Subject: [PATCH 15/80] fix: Icechunk zarr key serving and STAC zarr_format detection - _serve_icechunk_key: IcechunkStore is not subscriptable in icechunk v2; use _get_bytes_sync(key, prototype=...) instead of store[key] - _zarr_asset_metadata: Icechunk stores have no zarr.json in the root directory, so the format was never detected as v3; explicitly set zarr:zarr_format=3 for ICECHUNK artifacts so the map viewer uses zarr v3 key paths (zarr.json) instead of zarr v2 (.zmetadata/.zattrs) Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingestions/services.py | 8 ++++++-- climate_api/stac/services.py | 3 +++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 12a5a8a8..59b02d03 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -654,8 +654,12 @@ def _serve_icechunk_key( } try: - data: bytes = bytes(session.store[key]) # type: ignore[index] - except KeyError: + import zarr.core.buffer + proto = zarr.core.buffer.default_buffer_prototype() + data = session.store._get_bytes_sync(key, prototype=proto) + if data is None: + raise HTTPException(status_code=404, detail=f"Zarr key '{relative_path}' not found in store") + except (KeyError, FileNotFoundError): raise HTTPException(status_code=404, detail=f"Zarr key '{relative_path}' not found in store") if key.endswith("zarr.json"): diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index e6b8740e..528f73d4 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -425,6 +425,9 @@ def _keywords(artifact: ArtifactRecord, source_dataset: dict[str, Any]) -> list[ def _zarr_asset_metadata(artifact: ArtifactRecord) -> dict[str, object]: metadata: dict[str, object] = {"zarr:node_type": "group"} + if artifact.format == 
ArtifactFormat.ICECHUNK: + metadata["zarr:zarr_format"] = 3 + return metadata artifact_path = _artifact_store_path(artifact) consolidated = _zarr_consolidated_flag(artifact_path) if consolidated is not None: From bb2173cc40ce926fdfb979414eec29576a60c5ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 22:18:48 +0200 Subject: [PATCH 16/80] =?UTF-8?q?fix:=20WorldPop=20nodata=20handling=20?= =?UTF-8?q?=E2=80=94=20mask=20-99999=20to=20NaN=20during=20ingest?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The raw WorldPop GeoTIFFs use -99999 as the nodata sentinel. Previously this was stored as-is in the Icechunk store (no masking applied), and the STAC renders.nodata was set to 0.0 which never matched the actual values. - Fix display.nodata to -99999 so the map viewer's fillValue correctly marks those pixels as transparent - Mask da.rio.nodata (-99999) → NaN in _fetch_sync before writing, so future ingests store proper NaN nodata (zarr _FillValue: nan) - Fix _probe_estimate nodata to float("nan") to match Co-Authored-By: Claude Sonnet 4.6 --- climate_api/data/datasets/worldpop.yaml | 2 +- climate_api/ingest/plugins/worldpop.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/climate_api/data/datasets/worldpop.yaml b/climate_api/data/datasets/worldpop.yaml index 170780cd..bc648f78 100644 --- a/climate_api/data/datasets/worldpop.yaml +++ b/climate_api/data/datasets/worldpop.yaml @@ -28,4 +28,4 @@ display: colormap: reds range: [0.0, 25.0] - nodata: 0.0 + nodata: -99999 diff --git a/climate_api/ingest/plugins/worldpop.py b/climate_api/ingest/plugins/worldpop.py index 182714d5..cc50e5f4 100644 --- a/climate_api/ingest/plugins/worldpop.py +++ b/climate_api/ingest/plugins/worldpop.py @@ -100,6 +100,10 @@ def _fetch_sync(self, year: int, bbox: list[float]) -> xr.Dataset: xmin, ymin, xmax, ymax = map(float, bbox) da = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, 
maxy=ymax) da = da.squeeze("band", drop=True) + # Mask the rioxarray nodata sentinel (-99999) to NaN before writing. + _nodata = da.rio.nodata + if _nodata is not None and not math.isnan(float(_nodata)): + da = da.where(da != _nodata) da = da.load() ds = da.to_dataset(name="pop_total") @@ -121,7 +125,7 @@ def _probe_estimate(self, bbox: list[float]) -> GridSpec: shape=(ny, nx), crs=4326, dtype=np.dtype("float32"), - nodata=0.0, + nodata=float("nan"), time_dim=True, ) From 53789a937f30207671e562da75e2c84eed4ad16f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 22:40:20 +0200 Subject: [PATCH 17/80] chore: remove display.nodata from worldpop dataset Now that the plugin masks the -99999 source sentinel to NaN before writing, the zarr array's fill_value: NaN handles transparency directly. No explicit nodata hint is needed in the STAC renders. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/data/datasets/worldpop.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/climate_api/data/datasets/worldpop.yaml b/climate_api/data/datasets/worldpop.yaml index bc648f78..fcdaa921 100644 --- a/climate_api/data/datasets/worldpop.yaml +++ b/climate_api/data/datasets/worldpop.yaml @@ -28,4 +28,3 @@ display: colormap: reds range: [0.0, 25.0] - nodata: -99999 From 13e130997c2a603832f700d512932714dbfb1f52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 23:00:51 +0200 Subject: [PATCH 18/80] Code cleaning --- .../data_manager/services/downloader.py | 3 -- .../data_registry/services/datasets.py | 4 +- climate_api/ingest/orchestrator.py | 18 ++++++++ climate_api/ingest/plugins/chirps3.py | 8 +--- climate_api/ingest/plugins/era5_land.py | 4 +- climate_api/ingest/plugins/worldpop.py | 20 ++------- climate_api/ingest/store.py | 5 +-- climate_api/ingestions/services.py | 41 ++++++++++++++++--- climate_api/publications/services.py | 3 +- climate_api/stac/services.py | 1 + tests/test_datasets_sync.py | 29 
+++++++++---- tests/test_ingest_orchestrator.py | 9 ++++ tests/test_ingest_plugins.py | 8 +--- 13 files changed, 94 insertions(+), 59 deletions(-) diff --git a/climate_api/data_manager/services/downloader.py b/climate_api/data_manager/services/downloader.py index 67a340ae..b525e288 100644 --- a/climate_api/data_manager/services/downloader.py +++ b/climate_api/data_manager/services/downloader.py @@ -33,7 +33,6 @@ def _resolve_download_dir() -> Path: DOWNLOAD_DIR = _resolve_download_dir() - def build_dataset_zarr(dataset: dict[str, Any], *, start: str | None = None, end: str | None = None) -> None: """Collect dataset cache files into one optimised Zarr archive, clipped to request scope.""" logger.info(f"Optimizing cache for dataset {dataset['id']}") @@ -271,5 +270,3 @@ def _get_dynamic_function(full_path: str) -> Callable[..., Any]: function_name = parts[-1] module = importlib.import_module(module_path) return getattr(module, function_name) # type: ignore[no-any-return] - - diff --git a/climate_api/data_registry/services/datasets.py b/climate_api/data_registry/services/datasets.py index b3a1f354..b9e257ee 100644 --- a/climate_api/data_registry/services/datasets.py +++ b/climate_api/data_registry/services/datasets.py @@ -154,9 +154,7 @@ def _validate_dataset_template(dataset: object, *, source: str) -> None: raise ValueError(f"Dataset template '{dataset_id}' in {source} must define an 'ingestion' block") plugin = ingestion.get("plugin") if not (isinstance(plugin, str) and plugin): - raise ValueError( - f"Dataset template '{dataset_id}' in {source} must define ingestion.plugin" - ) + raise ValueError(f"Dataset template '{dataset_id}' in {source} must define ingestion.plugin") sync_availability = sync_block.get("availability") if isinstance(sync_block, dict) else None if sync_availability is not None: diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index e353b78a..d8e88f6b 100644 --- a/climate_api/ingest/orchestrator.py +++ 
b/climate_api/ingest/orchestrator.py @@ -28,6 +28,23 @@ logger = logging.getLogger(__name__) +_CF_ENCODING_KEYS = frozenset({"scale_factor", "add_offset", "missing_value", "_FillValue", "coordinates"}) + + +def _strip_cf_encoding(ds: xr.Dataset, period_type: str) -> None: + """Strip CF attrs and clear encoding to prevent zarr append conflicts. + + GeoTIFF-sourced arrays carry scale_factor/add_offset/_FillValue in both + .encoding and .attrs. xarray raises ValueError when appending to zarr if + those keys collide with the stored array metadata from a prior write. + """ + for name in list(ds.data_vars) + list(ds.coords): + ds[name].encoding.clear() + ds[name].attrs = {k: v for k, v in ds[name].attrs.items() if k not in _CF_ENCODING_KEYS} + if "time" in ds.coords: + units = "hours since 1970-01-01" if period_type == "hourly" else "days since 1970-01-01" + ds["time"].encoding.update({"units": units, "dtype": "int32"}) + def load_plugin( dotted_path: str, @@ -140,6 +157,7 @@ async def _fetch(period_id: str) -> xr.Dataset: ds = await task period_id = pending[i] + _strip_cf_encoding(ds, period_type=period_type) # Each period uses its own writable session so that to_zarr(append_dim=) # on the next period reads the committed store and finds the time axis. 
diff --git a/climate_api/ingest/plugins/chirps3.py b/climate_api/ingest/plugins/chirps3.py index 2faf90b4..ccba2095 100644 --- a/climate_api/ingest/plugins/chirps3.py +++ b/climate_api/ingest/plugins/chirps3.py @@ -77,9 +77,7 @@ async def periods(self, start: str, end: str) -> list[str]: return self._build_periods(start, end) async def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: - return await asyncio.get_running_loop().run_in_executor( - _executor, self._fetch_sync, period_id, bbox - ) + return await asyncio.get_running_loop().run_in_executor(_executor, self._fetch_sync, period_id, bbox) # ------------------------------------------------------------------ # Sync helpers (run inside the thread pool) @@ -115,10 +113,6 @@ def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: ds = da.to_dataset(name="precip") ds = ds.expand_dims(time=[np.datetime64(period_id, "D")]) - - for name in list(ds.data_vars) + list(ds.coords): - ds[name].encoding.clear() - ds["time"].encoding.update({"units": "days since 1970-01-01", "dtype": "int32"}) return ds def _probe_estimate(self, bbox: list[float]) -> GridSpec: diff --git a/climate_api/ingest/plugins/era5_land.py b/climate_api/ingest/plugins/era5_land.py index a882a2a7..5c3a905f 100644 --- a/climate_api/ingest/plugins/era5_land.py +++ b/climate_api/ingest/plugins/era5_land.py @@ -63,9 +63,7 @@ async def periods(self, start: str, end: str) -> list[str]: async def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: """Fetch one hourly period from the remote zarr store.""" - return await asyncio.get_running_loop().run_in_executor( - _executor, self._fetch_sync, period_id, bbox - ) + return await asyncio.get_running_loop().run_in_executor(_executor, self._fetch_sync, period_id, bbox) # ------------------------------------------------------------------ # Sync helpers (run inside the thread pool) diff --git a/climate_api/ingest/plugins/worldpop.py 
b/climate_api/ingest/plugins/worldpop.py index cc50e5f4..4aca4ed4 100644 --- a/climate_api/ingest/plugins/worldpop.py +++ b/climate_api/ingest/plugins/worldpop.py @@ -62,9 +62,7 @@ async def periods(self, start: str, end: str) -> list[str]: return self._build_periods(start, end) async def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: - return await asyncio.get_running_loop().run_in_executor( - _executor, self._fetch_sync, int(period_id), bbox - ) + return await asyncio.get_running_loop().run_in_executor(_executor, self._fetch_sync, int(period_id), bbox) # ------------------------------------------------------------------ # Sync helpers (run inside the thread pool) @@ -80,10 +78,7 @@ def _url_for_year(self, year: int) -> str: ) if self.version == "global1": filename = f"{cc.lower()}_ppp_{year}_UNadj.tif" - return ( - f"https://data.worldpop.org/GIS/Population/Global_2000_2020/" - f"{year}/{cc}/{filename}" - ) + return f"https://data.worldpop.org/GIS/Population/Global_2000_2020/{year}/{cc}/{filename}" raise ValueError(f"Unknown WorldPop version: {self.version!r}") def _fetch_sync(self, year: int, bbox: list[float]) -> xr.Dataset: @@ -108,12 +103,6 @@ def _fetch_sync(self, year: int, bbox: list[float]) -> xr.Dataset: ds = da.to_dataset(name="pop_total") ds = ds.expand_dims(time=[np.datetime64(f"{year}-01-01", "D")]) - - _CF_ENCODING_KEYS = {"scale_factor", "add_offset", "missing_value", "_FillValue", "coordinates"} - for name in list(ds.data_vars) + list(ds.coords): - ds[name].encoding.clear() - ds[name].attrs = {k: v for k, v in ds[name].attrs.items() if k not in _CF_ENCODING_KEYS} - ds["time"].encoding.update({"units": "days since 1970-01-01", "dtype": "int32"}) return ds def _probe_estimate(self, bbox: list[float]) -> GridSpec: @@ -138,7 +127,4 @@ def _build_periods(self, start: str, end: str) -> list[str]: start_year = int(start[:4]) end_year = int(end[:4]) valid_range = (2015, 2030) if self.version == "global2" else (2000, 2020) - 
return [ - str(y) - for y in range(max(start_year, valid_range[0]), min(end_year, valid_range[1]) + 1) - ] + return [str(y) for y in range(max(start_year, valid_range[0]), min(end_year, valid_range[1]) + 1)] diff --git a/climate_api/ingest/store.py b/climate_api/ingest/store.py index 564cfde1..d6c40233 100644 --- a/climate_api/ingest/store.py +++ b/climate_api/ingest/store.py @@ -56,10 +56,7 @@ def rechunk_store(store_path: Path, *, time_chunk: int) -> None: new_chunks = list(current) new_chunks[list(da.dims).index("time")] = effective_chunk else: - new_chunks = [ - effective_chunk if dim == "time" else da.sizes[dim] - for dim in da.dims - ] + new_chunks = [effective_chunk if dim == "time" else da.sizes[dim] for dim in da.dims] existing["chunks"] = new_chunks encoding[name] = existing diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 59b02d03..06993aa5 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -50,12 +50,36 @@ logger = logging.getLogger(__name__) +def _check_bbox_overlap(dataset: dict[str, object], instance_bbox: list[float]) -> None: + """Raise HTTP 400 if the dataset's declared spatial extent does not overlap the instance bbox.""" + extents = dataset.get("extents") + if not isinstance(extents, dict): + return + spatial = extents.get("spatial") + if not isinstance(spatial, dict): + return + dataset_bbox = spatial.get("bbox") + if not (isinstance(dataset_bbox, list) and len(dataset_bbox) == 4): + return + dx_min, dy_min, dx_max, dy_max = (float(v) for v in dataset_bbox) + ix_min, iy_min, ix_max, iy_max = (float(v) for v in instance_bbox) + if dx_max <= ix_min or dx_min >= ix_max or dy_max <= iy_min or dy_min >= iy_max: + raise HTTPException( + status_code=400, + detail=( + f"Dataset '{dataset.get('id')}' spatial extent {dataset_bbox} " + f"does not overlap the configured instance extent {instance_bbox}" + ), + ) + + def _read_crs_from_spatial_ref(ds: object) -> str | None: 
"""Return 'EPSG:' from a dataset's spatial_ref coordinate, or None.""" if "spatial_ref" not in ds.coords: return None try: import pyproj + attrs = dict(ds["spatial_ref"].attrs) wkt = attrs.get("crs_wkt") or attrs.get("spatial_ref") if not wkt: @@ -252,9 +276,10 @@ def _create_icechunk_artifact( params = dict(ingestion.get("params") or {}) extent = get_extent() - resolved_bbox: list[float] = list(bbox) if bbox is not None else ( - list(extent["bbox"]) if extent else [-180, -90, 180, 90] + resolved_bbox: list[float] = ( + list(bbox) if bbox is not None else (list(extent["bbox"]) if extent else [-180, -90, 180, 90]) ) + _check_bbox_overlap(dataset, resolved_bbox) store_path = downloader.DOWNLOAD_DIR / f"{dataset_id}.icechunk" extent_country_code = extent.get("country_code") if extent else None @@ -270,7 +295,12 @@ def _create_icechunk_artifact( rechunk_time: int | None = getattr(plugin, "rechunk_time", None) if ingest_start is None else None logger.info( "Running Icechunk ingest for '%s': ingest_scope=%s..%s artifact_scope=%s..%s rechunk_time=%s", - dataset_id, effective_start, end, start, end, rechunk_time, + dataset_id, + effective_start, + end, + start, + end, + rechunk_time, ) run_ingest_sync( plugin=plugin, @@ -614,9 +644,7 @@ def get_dataset_zarr_store_file_or_404( return FileResponse(target, media_type=media_type, filename=target.name) -def _serve_icechunk_key( - dataset_id: str, artifact: ArtifactRecord, relative_path: str -) -> Response | dict[str, object]: +def _serve_icechunk_key(dataset_id: str, artifact: ArtifactRecord, relative_path: str) -> Response | dict[str, object]: """Serve a zarr v3 key from an Icechunk store via its session store.""" import zarr @@ -655,6 +683,7 @@ def _serve_icechunk_key( try: import zarr.core.buffer + proto = zarr.core.buffer.default_buffer_prototype() data = session.store._get_bytes_sync(key, prototype=proto) if data is None: diff --git a/climate_api/publications/services.py b/climate_api/publications/services.py index 
5637b6ec..8e254de4 100644 --- a/climate_api/publications/services.py +++ b/climate_api/publications/services.py @@ -58,7 +58,8 @@ def publish_artifact(record: ArtifactRecord) -> ArtifactRecord: "collection_id": collection_id, "published_at": datetime.now(UTC), # Pyramid zarr and Icechunk stores are served via the /zarr endpoint, not pygeoapi. - "pygeoapi_path": None if (is_pyramid_zarr or is_icechunk) + "pygeoapi_path": None + if (is_pyramid_zarr or is_icechunk) else f"/ogcapi/collections/{collection_id}", } ) diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 528f73d4..80b2900d 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -504,6 +504,7 @@ def _detect_dataset_crs(ds: Any) -> str | None: return None try: import pyproj + attrs = dict(ds["spatial_ref"].attrs) wkt = attrs.get("crs_wkt") or attrs.get("spatial_ref") if not wkt: diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index ce1fcc1c..25d71f36 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -938,8 +938,14 @@ def test_sync_dataset_reads_committed_end_from_icechunk_store(monkeypatch: pytes monkeypatch.setattr( ingest_store, "read_committed_period_ids", - lambda path, period_type: {"2024-01-01T00", "2024-01-01T01", "2024-01-01T02", "2024-01-01T03", - "2024-01-01T04", "2024-01-01T05"}, + lambda path, period_type: { + "2024-01-01T00", + "2024-01-01T01", + "2024-01-01T02", + "2024-01-01T03", + "2024-01-01T04", + "2024-01-01T05", + }, ) captured: dict[str, object] = {} @@ -1033,15 +1039,20 @@ def fake_run_ingest_sync(**kwargs: object) -> None: monkeypatch.setattr(orchestrator_mod, "run_ingest_sync", fake_run_ingest_sync) monkeypatch.setattr(orchestrator_mod, "load_plugin", lambda path, params, extra_params=None: object()) monkeypatch.setattr(store_mod, "open_or_create_repo", lambda _: _FakeRepo()) - monkeypatch.setattr(svc, "coverage_from_open_dataset", lambda ds, **_: { - "has_data": True, - "coverage": { - 
"temporal": {"start": "2024-01-01T00", "end": "2024-01-01T06"}, - "spatial": {"xmin": 4.0, "ymin": 57.5, "xmax": 31.5, "ymax": 71.5}, + monkeypatch.setattr( + svc, + "coverage_from_open_dataset", + lambda ds, **_: { + "has_data": True, + "coverage": { + "temporal": {"start": "2024-01-01T00", "end": "2024-01-01T06"}, + "spatial": {"xmin": 4.0, "ymin": 57.5, "xmax": 31.5, "ymax": 71.5}, + }, }, - }) + ) monkeypatch.setattr( - xr, "open_zarr", + xr, + "open_zarr", lambda *_a, **_k: xr.Dataset({"t2m": xr.DataArray(np.zeros((1,)), dims=["time"])}), ) monkeypatch.setattr(svc, "get_extent", lambda: None) diff --git a/tests/test_ingest_orchestrator.py b/tests/test_ingest_orchestrator.py index b606cbda..54d66ba5 100644 --- a/tests/test_ingest_orchestrator.py +++ b/tests/test_ingest_orchestrator.py @@ -23,6 +23,7 @@ # Helpers # --------------------------------------------------------------------------- + def _make_monthly_dataset(period_id: str, ny: int = 4, nx: int = 4) -> xr.Dataset: """Return a tiny single-period dataset matching FakePlugin's grid.""" t = pd.Timestamp(f"{period_id}-01") @@ -37,6 +38,7 @@ def _make_monthly_dataset(period_id: str, ny: int = 4, nx: int = 4) -> xr.Datase # Fake plugin # --------------------------------------------------------------------------- + class FakePlugin: """In-memory IngestionPlugin that generates tiny xarray Datasets.""" @@ -62,6 +64,7 @@ async def fetch_period(self, period_id: str, bbox: list[float], **params: Any) - # Protocol conformance # --------------------------------------------------------------------------- + def test_fake_plugin_satisfies_protocol() -> None: plugin = FakePlugin(["2024-01", "2024-02"]) assert isinstance(plugin, IngestionPlugin) @@ -71,6 +74,7 @@ def test_fake_plugin_satisfies_protocol() -> None: # Core orchestrator tests # --------------------------------------------------------------------------- + def test_run_ingest_writes_all_periods(tmp_path: Path) -> None: plugin = FakePlugin(["2024-01", "2024-02", 
"2024-03"]) store_path = tmp_path / "test.icechunk" @@ -280,6 +284,7 @@ def cancel_after_two() -> bool: # Sync wrapper # --------------------------------------------------------------------------- + def test_run_ingest_sync_wrapper(tmp_path: Path) -> None: plugin = FakePlugin(["2024-01", "2024-02"]) store_path = tmp_path / "test.icechunk" @@ -301,6 +306,7 @@ def test_run_ingest_sync_wrapper(tmp_path: Path) -> None: # load_plugin # --------------------------------------------------------------------------- + def test_load_plugin_imports_and_instantiates(tmp_path: Path) -> None: """load_plugin can resolve built-in plugins by dotted path.""" plugin = load_plugin("climate_api.ingest.plugins.era5_land.Era5LandPlugin", {"variable": "t2m"}) @@ -322,6 +328,7 @@ def test_load_plugin_raises_for_non_protocol() -> None: # read_committed_period_ids # --------------------------------------------------------------------------- + def test_read_committed_period_ids_empty_when_no_store(tmp_path: Path) -> None: assert read_committed_period_ids(tmp_path / "nostore.icechunk", "monthly") == set() @@ -330,6 +337,7 @@ def test_read_committed_period_ids_empty_when_no_store(tmp_path: Path) -> None: # Era5LandPlugin._build_periods (unit tests, no network) # --------------------------------------------------------------------------- + def test_era5land_build_periods_respects_hour_component() -> None: from climate_api.ingest.plugins.era5_land import Era5LandPlugin @@ -358,6 +366,7 @@ def test_era5land_build_periods_spans_months() -> None: # Rechunking # --------------------------------------------------------------------------- + def _time_chunk_size(store_path: Path) -> int: """Read the time chunk size of the first data variable from the committed store.""" import icechunk diff --git a/tests/test_ingest_plugins.py b/tests/test_ingest_plugins.py index 2ed9523e..f92ef1bb 100644 --- a/tests/test_ingest_plugins.py +++ b/tests/test_ingest_plugins.py @@ -162,9 +162,7 @@ def 
test_fetch_period_returns_dataset_with_time_and_pop_total(self) -> None: fake_resp.raise_for_status = lambda: None fake_resp.content = b"" - with patch("requests.get", return_value=fake_resp), patch( - "rioxarray.open_rasterio", return_value=fake_da - ): + with patch("requests.get", return_value=fake_resp), patch("rioxarray.open_rasterio", return_value=fake_da): ds = WorldPopPlugin(country_code="NOR")._fetch_sync(2024, [4.0, 57.5, 31.5, 71.5]) assert "pop_total" in ds.data_vars @@ -181,9 +179,7 @@ def test_fetch_period_clears_encoding_except_time(self) -> None: fake_resp.raise_for_status = lambda: None fake_resp.content = b"" - with patch("requests.get", return_value=fake_resp), patch( - "rioxarray.open_rasterio", return_value=fake_da - ): + with patch("requests.get", return_value=fake_resp), patch("rioxarray.open_rasterio", return_value=fake_da): ds = WorldPopPlugin(country_code="NOR")._fetch_sync(2024, [4.0, 57.5, 31.5, 71.5]) assert ds["time"].encoding.get("units") == "days since 1970-01-01" From de92cb52e5fec12fa7ccb60fb8be0b4de0891ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 23:09:35 +0200 Subject: [PATCH 19/80] fix: resolve all mypy and pyright lint errors Co-Authored-By: Claude Sonnet 4.6 --- climate_api/data_accessor/services/accessor.py | 2 +- climate_api/ingest/plugins/chirps3.py | 4 ++-- climate_api/ingest/plugins/worldpop.py | 4 ++-- climate_api/ingest/store.py | 2 +- climate_api/ingestions/services.py | 17 ++++++++++------- climate_api/system/routes.py | 2 -- tests/test_datasets_sync.py | 4 ++-- tests/test_ingest_orchestrator.py | 4 ++-- 8 files changed, 20 insertions(+), 19 deletions(-) diff --git a/climate_api/data_accessor/services/accessor.py b/climate_api/data_accessor/services/accessor.py index e2ac9786..85196682 100644 --- a/climate_api/data_accessor/services/accessor.py +++ b/climate_api/data_accessor/services/accessor.py @@ -131,7 +131,7 @@ def open_icechunk_dataset(store_path: str | Path) -> 
xr.Dataset: repo = open_or_create_repo(Path(store_path)) session = repo.readonly_session("main") - return xr.open_zarr(session.store) + return xr.open_zarr(session.store) # type: ignore[no-any-return] def _open_zarr(zarr_path: str) -> xr.Dataset: diff --git a/climate_api/ingest/plugins/chirps3.py b/climate_api/ingest/plugins/chirps3.py index ccba2095..97371e7f 100644 --- a/climate_api/ingest/plugins/chirps3.py +++ b/climate_api/ingest/plugins/chirps3.py @@ -104,6 +104,7 @@ def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: logger.info("Fetching CHIRPS3 %s: %s", period_id, url) da = rioxarray.open_rasterio(url, chunks=None, masked=True, lock=False) + assert isinstance(da, xr.DataArray) xmin, ymin, xmax, ymax = map(float, bbox) da = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax) da = da.squeeze("band", drop=True) @@ -112,8 +113,7 @@ def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: da = da.load() ds = da.to_dataset(name="precip") - ds = ds.expand_dims(time=[np.datetime64(period_id, "D")]) - return ds + return ds.expand_dims(time=[np.datetime64(period_id, "D")]) # type: ignore[no-any-return] def _probe_estimate(self, bbox: list[float]) -> GridSpec: """Derive GridSpec from CHIRPS3's known 0.05° resolution.""" diff --git a/climate_api/ingest/plugins/worldpop.py b/climate_api/ingest/plugins/worldpop.py index 4aca4ed4..41281901 100644 --- a/climate_api/ingest/plugins/worldpop.py +++ b/climate_api/ingest/plugins/worldpop.py @@ -92,6 +92,7 @@ def _fetch_sync(self, year: int, bbox: list[float]) -> xr.Dataset: resp.raise_for_status() da = rioxarray.open_rasterio(io.BytesIO(resp.content)) + assert isinstance(da, xr.DataArray) xmin, ymin, xmax, ymax = map(float, bbox) da = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax) da = da.squeeze("band", drop=True) @@ -102,8 +103,7 @@ def _fetch_sync(self, year: int, bbox: list[float]) -> xr.Dataset: da = da.load() ds = da.to_dataset(name="pop_total") - ds = 
ds.expand_dims(time=[np.datetime64(f"{year}-01-01", "D")]) - return ds + return ds.expand_dims(time=[np.datetime64(f"{year}-01-01", "D")]) # type: ignore[no-any-return] def _probe_estimate(self, bbox: list[float]) -> GridSpec: """Derive GridSpec from WorldPop's known 3 arc-second resolution.""" diff --git a/climate_api/ingest/store.py b/climate_api/ingest/store.py index d6c40233..0149a6ae 100644 --- a/climate_api/ingest/store.py +++ b/climate_api/ingest/store.py @@ -58,7 +58,7 @@ def rechunk_store(store_path: Path, *, time_chunk: int) -> None: else: new_chunks = [effective_chunk if dim == "time" else da.sizes[dim] for dim in da.dims] existing["chunks"] = new_chunks - encoding[name] = existing + encoding[name] = existing # pyright: ignore[reportArgumentType] write_session = repo.writable_session("main") ds.chunk({"time": effective_chunk}).to_zarr(write_session.store, mode="w", encoding=encoding) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 06993aa5..b5a99ed8 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -13,6 +13,7 @@ import portalocker import pyproj +import xarray as xr from fastapi import HTTPException from fastapi.responses import FileResponse, JSONResponse from starlette.responses import Response @@ -73,7 +74,7 @@ def _check_bbox_overlap(dataset: dict[str, object], instance_bbox: list[float]) ) -def _read_crs_from_spatial_ref(ds: object) -> str | None: +def _read_crs_from_spatial_ref(ds: xr.Dataset) -> str | None: """Return 'EPSG:' from a dataset's spatial_ref coordinate, or None.""" if "spatial_ref" not in ds.coords: return None @@ -271,9 +272,11 @@ def _create_icechunk_artifact( dataset_id = str(dataset["id"]) period_type = str(dataset["period_type"]) - ingestion = dict(dataset.get("ingestion") or {}) # type: ignore[arg-type] + _raw_ingestion = dataset.get("ingestion") + ingestion: dict[str, object] = dict(_raw_ingestion) if isinstance(_raw_ingestion, dict) else {} 
plugin_path = str(ingestion["plugin"]) - params = dict(ingestion.get("params") or {}) + _raw_params = ingestion.get("params") + params: dict[str, object] = dict(_raw_params) if isinstance(_raw_params, dict) else {} extent = get_extent() resolved_bbox: list[float] = ( @@ -545,14 +548,14 @@ def _icechunk_store_info(dataset_id: str, artifact: ArtifactRecord) -> dict[str, store_attrs: dict[str, object] = {} try: root_meta = json.loads(bytes(session.store["zarr.json"])) # type: ignore[index] - store_attrs = root_meta.get("attributes", {}) # type: ignore[assignment] + store_attrs = root_meta.get("attributes", {}) except Exception: pass store_crs = store_attrs.get("proj:code") crs = store_crs if isinstance(store_crs, str) and store_crs else api_config.get_crs() - root: zarr.Group = zarr.open_group(session.store, mode="r") # type: ignore[assignment] + root: zarr.Group = zarr.open_group(session.store, mode="r") entries = [ { "name": name, @@ -660,7 +663,7 @@ def _serve_icechunk_key(dataset_id: str, artifact: ArtifactRecord, relative_path # Directory-like paths: list child keys as a ZarrListing if not key or key.endswith("/"): - root: zarr.Group = zarr.open_group(session.store, mode="r") # type: ignore[assignment] + root: zarr.Group = zarr.open_group(session.store, mode="r") prefix = key.rstrip("/") try: node: zarr.Group = root[prefix] if prefix else root # type: ignore[assignment] @@ -686,7 +689,7 @@ def _serve_icechunk_key(dataset_id: str, artifact: ArtifactRecord, relative_path proto = zarr.core.buffer.default_buffer_prototype() data = session.store._get_bytes_sync(key, prototype=proto) - if data is None: + if data is None: # pyright: ignore[reportUnnecessaryComparison] raise HTTPException(status_code=404, detail=f"Zarr key '{relative_path}' not found in store") except (KeyError, FileNotFoundError): raise HTTPException(status_code=404, detail=f"Zarr key '{relative_path}' not found in store") diff --git a/climate_api/system/routes.py b/climate_api/system/routes.py index 
3ac2d93f..6298dd65 100644 --- a/climate_api/system/routes.py +++ b/climate_api/system/routes.py @@ -66,14 +66,12 @@ async def manage_ingest(request: Request) -> RedirectResponse: extent = get_extent_or_404() resolved_bbox = list(extent["bbox"]) - country_code = extent.get("country_code") create_artifact( dataset=template, start=start, end=end, bbox=resolved_bbox, - country_code=country_code, overwrite=overwrite, prefer_zarr=True, publish=publish, diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index 25d71f36..78d59aab 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -1087,7 +1087,7 @@ def test_create_icechunk_artifact_uses_ingest_start_for_delta_efficiency( } svc._create_icechunk_artifact( - dataset=dataset, + dataset=dataset, # type: ignore[arg-type] start="2024-01-01T00", end="2024-01-01T06", bbox=None, @@ -1117,7 +1117,7 @@ def test_create_icechunk_artifact_uses_full_start_when_no_ingest_start( } svc._create_icechunk_artifact( - dataset=dataset, + dataset=dataset, # type: ignore[arg-type] start="2024-01-01T00", end="2024-01-01T06", bbox=None, diff --git a/tests/test_ingest_orchestrator.py b/tests/test_ingest_orchestrator.py index 54d66ba5..adb99ca8 100644 --- a/tests/test_ingest_orchestrator.py +++ b/tests/test_ingest_orchestrator.py @@ -377,9 +377,9 @@ def _time_chunk_size(store_path: Path) -> int: g = zarr.open_group(session.store, mode="r") for name in g.array_keys(): arr = g[name] - dims = list(arr.metadata.dimension_names or []) + dims = list(arr.metadata.dimension_names or []) # type: ignore[union-attr] if "time" in dims: - return arr.chunks[dims.index("time")] + return arr.chunks[dims.index("time")] # type: ignore[union-attr] raise AssertionError("No array with a time dimension found") From 254a02c263b3c28f9a2af24686f76ec245b01435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 23:12:04 +0200 Subject: [PATCH 20/80] fix: treat empty icechunk skeleton as first write 
in orchestrator When a previous ingest attempt fails before writing any data, the store directory exists as an empty skeleton. is_first_write was False because the path existed, causing append_dim='time' to fail on the empty store. Base is_first_write on done_offset == 0 instead, which correctly handles both new stores and empty-skeleton stores from failed initialisations. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/orchestrator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index d8e88f6b..8ab1258d 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -134,7 +134,10 @@ async def run_ingest( if on_progress: on_progress(done=done_offset, total=len(all_periods), message=f"{len(pending)} periods pending") - is_first_write = not store_path.exists() + # True when no periods have been committed yet — handles both a brand-new + # store and a store directory that exists as an empty skeleton from a + # previous failed initialisation (where append_dim would fail on an empty store). 
+ is_first_write = done_offset == 0 repo = open_or_create_repo(store_path) semaphore = asyncio.Semaphore(plugin.max_concurrency) From d7969bd294fbb24377f928d43feb0beee7f16145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 23:34:41 +0200 Subject: [PATCH 21/80] fix: update tests and guard _detect_dataset_crs after encoding/nodata refactors - WorldPop nodata changed from 0.0 to NaN: update assertion to math.isnan() - CF encoding moved to orchestrator: plugin encoding tests now assert on dataset structure (time dim present) instead of time encoding being set - _detect_dataset_crs: guard against objects without .coords so STAC tests using lightweight DummyDataset stubs don't crash with AttributeError Co-Authored-By: Claude Sonnet 4.6 --- climate_api/stac/services.py | 2 +- tests/test_ingest_plugins.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 80b2900d..03177786 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -500,7 +500,7 @@ def _detect_dataset_crs(ds: Any) -> str | None: the deployment-wide proj:code with the actual native CRS of the data so that datasets stored in WGS84 (e.g. WorldPop) are not misidentified as projected. 
""" - if "spatial_ref" not in ds.coords: + if not hasattr(ds, "coords") or "spatial_ref" not in ds.coords: return None try: import pyproj diff --git a/tests/test_ingest_plugins.py b/tests/test_ingest_plugins.py index f92ef1bb..682bf78b 100644 --- a/tests/test_ingest_plugins.py +++ b/tests/test_ingest_plugins.py @@ -8,6 +8,7 @@ from __future__ import annotations import asyncio +import math from datetime import date from typing import Any from unittest.mock import MagicMock, patch @@ -119,7 +120,7 @@ def test_probe_estimate_returns_gridspec(self) -> None: assert spec.crs == 4326 assert spec.time_dim is True assert spec.dtype == np.dtype("float32") - assert spec.nodata == 0.0 + assert math.isnan(spec.nodata) assert spec.shape[0] > 0 and spec.shape[1] > 0 def test_probe_estimate_shape_proportional_to_bbox(self) -> None: @@ -171,7 +172,7 @@ def test_fetch_period_returns_dataset_with_time_and_pop_total(self) -> None: time_val = pd.Timestamp(ds["time"].values[0]) assert time_val.year == 2024 - def test_fetch_period_clears_encoding_except_time(self) -> None: + def test_fetch_period_returns_dataset_with_time_dim(self) -> None: from climate_api.ingest.plugins.worldpop import WorldPopPlugin fake_da = self._make_fake_da() @@ -182,7 +183,9 @@ def test_fetch_period_clears_encoding_except_time(self) -> None: with patch("requests.get", return_value=fake_resp), patch("rioxarray.open_rasterio", return_value=fake_da): ds = WorldPopPlugin(country_code="NOR")._fetch_sync(2024, [4.0, 57.5, 31.5, 71.5]) - assert ds["time"].encoding.get("units") == "days since 1970-01-01" + assert "time" in ds.dims + assert ds.sizes["time"] == 1 + # Encoding is intentionally left unset — the orchestrator's _strip_cf_encoding handles it. 
# --------------------------------------------------------------------------- @@ -386,11 +389,13 @@ def test_fetch_period_masks_nodata_as_nan(self) -> None: assert np.isnan(precip).any(), "nodata pixels should be NaN" assert not np.isnan(precip).all(), "non-nodata pixels should be finite" - def test_fetch_period_time_encoding_pinned(self) -> None: + def test_fetch_period_returns_dataset_with_time_dim(self) -> None: from climate_api.ingest.plugins.chirps3 import Chirps3Plugin fake_da = self._make_fake_chirps_da() with patch("rioxarray.open_rasterio", return_value=fake_da): ds = Chirps3Plugin()._fetch_sync("2024-03-15", [-5.0, 5.0, 5.0, 10.0]) - assert ds["time"].encoding.get("units") == "days since 1970-01-01" + assert "time" in ds.dims + assert ds.sizes["time"] == 1 + # Encoding is intentionally left unset — the orchestrator's _strip_cf_encoding handles it. From f5ae74de2345b9388a4913a2b12121b1e1ddf346 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 23:36:59 +0200 Subject: [PATCH 22/80] fix: guard math.isnan call against None for pyright Co-Authored-By: Claude Sonnet 4.6 --- tests/test_ingest_plugins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_ingest_plugins.py b/tests/test_ingest_plugins.py index 682bf78b..fc9d435d 100644 --- a/tests/test_ingest_plugins.py +++ b/tests/test_ingest_plugins.py @@ -120,7 +120,7 @@ def test_probe_estimate_returns_gridspec(self) -> None: assert spec.crs == 4326 assert spec.time_dim is True assert spec.dtype == np.dtype("float32") - assert math.isnan(spec.nodata) + assert spec.nodata is not None and math.isnan(spec.nodata) assert spec.shape[0] > 0 and spec.shape[1] > 0 def test_probe_estimate_shape_proportional_to_bbox(self) -> None: From f09c870765842c488bdeaad07af3dcb991d34ded Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Tue, 19 May 2026 23:52:51 +0200 Subject: [PATCH 23/80] =?UTF-8?q?fix:=20address=20Copilot=20PR=20review=20?= 
=?UTF-8?q?=E2=80=94=20resource=20leaks,=20prefer=5Fzarr,=20GridSpec,=20pl?= =?UTF-8?q?ugin=20errors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - store.py: close xarray datasets in rechunk_store and read_committed_period_ids via try/finally to prevent file handle / thread leaks on all code paths - accessor.py: open_icechunk_dataset uses Repository.open() directly instead of open_or_create_repo() — raises FileNotFoundError rather than silently creating an empty store on a read-only code path - protocol.py: fix commit_batch_size docstring — controls cursor checkpoint frequency, not Icechunk commit frequency (every period is always committed) - services.py: prefer_zarr reuse check now accepts ICECHUNK alongside ZARR so sync and resample calls can reuse existing Icechunk artifacts - services.py: catch TypeError from load_plugin and re-raise as HTTP 400 so missing required plugin params (e.g. WorldPop country_code) surface clearly - era5_land.py: GridSpec x_dim/y_dim corrected to "x"/"y" matching the renamed dims that fetch_period actually returns Co-Authored-By: Claude Sonnet 4.6 --- .../data_accessor/services/accessor.py | 8 ++- climate_api/ingest/plugins/era5_land.py | 4 +- climate_api/ingest/protocol.py | 9 +-- climate_api/ingest/store.py | 63 ++++++++++--------- climate_api/ingestions/services.py | 7 ++- 5 files changed, 52 insertions(+), 39 deletions(-) diff --git a/climate_api/data_accessor/services/accessor.py b/climate_api/data_accessor/services/accessor.py index 85196682..9d4faa92 100644 --- a/climate_api/data_accessor/services/accessor.py +++ b/climate_api/data_accessor/services/accessor.py @@ -127,9 +127,13 @@ def open_zarr_dataset(zarr_path: str) -> xr.Dataset: def open_icechunk_dataset(store_path: str | Path) -> xr.Dataset: """Open an Icechunk store as an xarray Dataset via a readonly MVCC session.""" - from climate_api.ingest.store import open_or_create_repo + import icechunk - repo = 
open_or_create_repo(Path(store_path)) + path = Path(store_path) + if not path.exists(): + raise FileNotFoundError(f"Icechunk store not found: {path}") + storage = icechunk.local_filesystem_storage(str(path)) + repo = icechunk.Repository.open(storage) session = repo.readonly_session("main") return xr.open_zarr(session.store) # type: ignore[no-any-return] diff --git a/climate_api/ingest/plugins/era5_land.py b/climate_api/ingest/plugins/era5_land.py index 5c3a905f..37a5429f 100644 --- a/climate_api/ingest/plugins/era5_land.py +++ b/climate_api/ingest/plugins/era5_land.py @@ -103,8 +103,8 @@ def _probe_sync(self, bbox: list[float]) -> GridSpec: dtype=np.dtype(da.dtype), nodata=None, time_dim=True, - x_dim="longitude", - y_dim="latitude", + x_dim="x", + y_dim="y", ) def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: diff --git a/climate_api/ingest/protocol.py b/climate_api/ingest/protocol.py index 8f5e109b..5afb4798 100644 --- a/climate_api/ingest/protocol.py +++ b/climate_api/ingest/protocol.py @@ -49,10 +49,11 @@ class IngestionPlugin(Protocol): Keep at 1 for sources with large per-period files or rate-limited APIs. Raise for sources where individual periods are small (< 50 MB). - commit_batch_size: number of periods written between Icechunk commits. - Use 1 for monthly sources. For daily sources use ~30; for hourly ~720. - This controls crash-recovery granularity, not peak memory — to_zarr - flushes each period immediately. + commit_batch_size: how often the job cursor checkpoint is saved. + Every period is always committed individually to Icechunk; this + controls how frequently the orchestrator persists the cursor so that a + restart resumes from the last checkpoint rather than re-scanning the + store. Use 1 for monthly sources, ~30 for daily, ~720 for hourly. 
""" max_concurrency: int diff --git a/climate_api/ingest/store.py b/climate_api/ingest/store.py index 0149a6ae..234c486c 100644 --- a/climate_api/ingest/store.py +++ b/climate_api/ingest/store.py @@ -40,30 +40,32 @@ def rechunk_store(store_path: Path, *, time_chunk: int) -> None: repo = open_or_create_repo(store_path) read_session = repo.readonly_session("main") ds = xr.open_zarr(read_session.store) - - n_times = ds.sizes.get("time", 0) - if n_times == 0: - return - - effective_chunk = min(time_chunk, n_times) - encoding: dict[str, dict] = {} - for name in list(ds.data_vars) + list(ds.coords): - da = ds[name] - existing = dict(da.encoding) - if "time" in da.dims: - current = existing.get("chunks") - if isinstance(current, (list, tuple)): - new_chunks = list(current) - new_chunks[list(da.dims).index("time")] = effective_chunk - else: - new_chunks = [effective_chunk if dim == "time" else da.sizes[dim] for dim in da.dims] - existing["chunks"] = new_chunks - encoding[name] = existing # pyright: ignore[reportArgumentType] - - write_session = repo.writable_session("main") - ds.chunk({"time": effective_chunk}).to_zarr(write_session.store, mode="w", encoding=encoding) - write_session.commit(f"rechunk: time={effective_chunk}") - logger.info("Rechunked %s: time chunk → %d (%d periods)", store_path, effective_chunk, n_times) + try: + n_times = ds.sizes.get("time", 0) + if n_times == 0: + return + + effective_chunk = min(time_chunk, n_times) + encoding: dict[str, dict] = {} + for name in list(ds.data_vars) + list(ds.coords): + da = ds[name] + existing = dict(da.encoding) + if "time" in da.dims: + current = existing.get("chunks") + if isinstance(current, (list, tuple)): + new_chunks = list(current) + new_chunks[list(da.dims).index("time")] = effective_chunk + else: + new_chunks = [effective_chunk if dim == "time" else da.sizes[dim] for dim in da.dims] + existing["chunks"] = new_chunks + encoding[name] = existing # pyright: ignore[reportArgumentType] + + write_session = 
repo.writable_session("main") + ds.chunk({"time": effective_chunk}).to_zarr(write_session.store, mode="w", encoding=encoding) + write_session.commit(f"rechunk: time={effective_chunk}") + logger.info("Rechunked %s: time chunk → %d (%d periods)", store_path, effective_chunk, n_times) + finally: + ds.close() def read_committed_period_ids(store_path: Path, period_type: str) -> set[str]: @@ -84,11 +86,14 @@ def read_committed_period_ids(store_path: Path, period_type: str) -> set[str]: repo = open_or_create_repo(store_path) session = repo.readonly_session("main") ds = xr.open_zarr(session.store) - if "time" not in ds.coords: - return set() - import pandas as pd - - return {datetime_to_period_string(pd.Timestamp(t.item()).to_pydatetime(), period_type) for t in ds.time} + try: + if "time" not in ds.coords: + return set() + import pandas as pd + + return {datetime_to_period_string(pd.Timestamp(t.item()).to_pydatetime(), period_type) for t in ds.time} + finally: + ds.close() except Exception: logger.debug("Could not read committed periods from %s", store_path, exc_info=True) return set() diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index b5a99ed8..798cacca 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -289,7 +289,10 @@ def _create_icechunk_artifact( extra_params: dict[str, object] = {} if extent_country_code: extra_params["country_code"] = extent_country_code - plugin = load_plugin(plugin_path, params, extra_params=extra_params or None) + try: + plugin = load_plugin(plugin_path, params, extra_params=extra_params or None) + except TypeError as exc: + raise HTTPException(status_code=400, detail=f"Plugin configuration error: {exc}") from exc effective_start = ingest_start if ingest_start is not None else start # Rechunk after the initial ingest (when no delta start is provided) using the @@ -940,7 +943,7 @@ def _find_existing_artifact_in_records( record.request_scope.end, ) continue - if 
prefer_zarr and record.format != ArtifactFormat.ZARR: + if prefer_zarr and record.format not in (ArtifactFormat.ZARR, ArtifactFormat.ICECHUNK): continue return record return None From 9e7c85eff809191111b8e2b62e5fd6e8532d0000 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 00:00:35 +0200 Subject: [PATCH 24/80] fix: strip CF attrs from zarr encoding in rechunk_store xarray copies CF convention keys (coordinates, scale_factor, etc.) from zarr metadata into .encoding when reading back a stored dataset. Passing these back to to_zarr() raises ValueError: unexpected encoding parameters. Filter them out before building the encoding dict for the rechunk write. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/store.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/climate_api/ingest/store.py b/climate_api/ingest/store.py index 234c486c..b3a38540 100644 --- a/climate_api/ingest/store.py +++ b/climate_api/ingest/store.py @@ -46,10 +46,14 @@ def rechunk_store(store_path: Path, *, time_chunk: int) -> None: return effective_chunk = min(time_chunk, n_times) + # Keys that are CF conventions attrs, not valid zarr encoding parameters. + # xarray copies them into .encoding when reading from zarr; strip them + # before passing back to to_zarr() to avoid ValueError. 
+ _INVALID_ZARR_KEYS = frozenset({"scale_factor", "add_offset", "missing_value", "_FillValue", "coordinates"}) encoding: dict[str, dict] = {} for name in list(ds.data_vars) + list(ds.coords): da = ds[name] - existing = dict(da.encoding) + existing = {k: v for k, v in da.encoding.items() if k not in _INVALID_ZARR_KEYS} if "time" in da.dims: current = existing.get("chunks") if isinstance(current, (list, tuple)): From 0a129e18d61200e4c9f9e83e5ff2ea58c6ce80e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 00:06:06 +0200 Subject: [PATCH 25/80] =?UTF-8?q?fix:=20address=20second=20Copilot=20PR=20?= =?UTF-8?q?review=20=E2=80=94=20task=20leak,=20subscript,=20buffer,=20empt?= =?UTF-8?q?y=20store,=20bbox?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - orchestrator.py: cancel remaining tasks when breaking early for time_dim=False static datasets (previously left asyncio tasks running/pending in background) - services.py: _icechunk_store_info reads root attrs via zarr.open_group().attrs instead of session.store["zarr.json"] — IcechunkStore is not subscriptable - services.py: convert _get_bytes_sync result to bytes() explicitly before json.loads / Response — zarr Buffer is not always bytes across versions - services.py: guard _create_icechunk_artifact against empty/missing store after run_ingest_sync returns — raises HTTP 409 rather than crashing with a zarr error - services.py: _check_bbox_overlap catches TypeError/ValueError from float() conversion of malformed dataset bbox values instead of raising 500 - tests: fake_run_ingest_sync creates the store directory so the new existence check in _create_icechunk_artifact passes in unit tests Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/orchestrator.py | 2 ++ climate_api/ingestions/services.py | 28 ++++++++++++++++------------ tests/test_datasets_sync.py | 1 + 3 files changed, 19 insertions(+), 12 deletions(-) diff --git 
a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index 8ab1258d..9f3e11e2 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -191,6 +191,8 @@ async def _fetch(period_id: str) -> xr.Dataset: on_progress(done=done_offset + i + 1, total=len(all_periods), message=f"Wrote {period_id}") if not spec.time_dim: + for t in tasks[i + 1 :]: + t.cancel() break if rechunk_time is not None and spec.time_dim: diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 798cacca..6f451abb 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -62,7 +62,10 @@ def _check_bbox_overlap(dataset: dict[str, object], instance_bbox: list[float]) dataset_bbox = spatial.get("bbox") if not (isinstance(dataset_bbox, list) and len(dataset_bbox) == 4): return - dx_min, dy_min, dx_max, dy_max = (float(v) for v in dataset_bbox) + try: + dx_min, dy_min, dx_max, dy_max = (float(v) for v in dataset_bbox) + except (TypeError, ValueError): + return # malformed dataset bbox — skip overlap check ix_min, iy_min, ix_max, iy_max = (float(v) for v in instance_bbox) if dx_max <= ix_min or dx_min >= ix_max or dy_max <= iy_min or dy_min >= iy_max: raise HTTPException( @@ -319,11 +322,17 @@ def _create_icechunk_artifact( rechunk_time=rechunk_time, ) + if not store_path.exists(): + raise HTTPException(status_code=409, detail="Plugin returned no periods for the requested range") + repo = open_or_create_repo(store_path) session = repo.readonly_session("main") import xarray as xr - ds = xr.open_zarr(session.store) + try: + ds = xr.open_zarr(session.store) + except Exception as exc: + raise HTTPException(status_code=409, detail="Ingest produced no readable data for the requested range") from exc from climate_api import config as api_config native_crs = _read_crs_from_spatial_ref(ds) or api_config.get_crs() or "EPSG:4326" @@ -548,17 +557,11 @@ def _icechunk_store_info(dataset_id: str, 
artifact: ArtifactRecord) -> dict[str, repo = open_or_create_repo(store_path) session = repo.readonly_session("main") - store_attrs: dict[str, object] = {} - try: - root_meta = json.loads(bytes(session.store["zarr.json"])) # type: ignore[index] - store_attrs = root_meta.get("attributes", {}) - except Exception: - pass + root: zarr.Group = zarr.open_group(session.store, mode="r") + store_attrs: dict[str, object] = dict(root.attrs) store_crs = store_attrs.get("proj:code") crs = store_crs if isinstance(store_crs, str) and store_crs else api_config.get_crs() - - root: zarr.Group = zarr.open_group(session.store, mode="r") entries = [ { "name": name, @@ -697,9 +700,10 @@ def _serve_icechunk_key(dataset_id: str, artifact: ArtifactRecord, relative_path except (KeyError, FileNotFoundError): raise HTTPException(status_code=404, detail=f"Zarr key '{relative_path}' not found in store") + raw = bytes(data) if key.endswith("zarr.json"): - return JSONResponse(content=json.loads(data)) - return Response(content=data, media_type="application/octet-stream") + return JSONResponse(content=json.loads(raw)) + return Response(content=raw, media_type="application/octet-stream") def _load_records() -> list[ArtifactRecord]: diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index 78d59aab..63654784 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -1035,6 +1035,7 @@ def _patch_icechunk_artifact_dependencies( def fake_run_ingest_sync(**kwargs: object) -> None: captured.update(kwargs) + Path(str(kwargs["store_path"])).mkdir(exist_ok=True) monkeypatch.setattr(orchestrator_mod, "run_ingest_sync", fake_run_ingest_sync) monkeypatch.setattr(orchestrator_mod, "load_plugin", lambda path, params, extra_params=None: object()) From 296e0f6d0de811712e294c50f483d7bc9f96b7a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 00:18:20 +0200 Subject: [PATCH 26/80] =?UTF-8?q?fix:=20address=20third=20Copilot=20PR=20r?= 
=?UTF-8?q?eview=20=E2=80=94=20cursor=20safety,=20overwrite,=20ds.close,?= =?UTF-8?q?=20request=5Fscope?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - orchestrator.py: cursor resume now requires store_path.exists() before trusting last_committed — a deleted store with a stale cursor previously caused the first write to use append_dim on an empty repo, crashing with a zarr error - services.py: overwrite=True now deletes the existing icechunk store before run_ingest_sync so the ingest is actually fresh rather than a no-op (all previously committed periods were skipped by the resume logic) - services.py: coverage_from_open_dataset wrapped in try/finally to guarantee ds.close() runs even when coverage calculation raises - services.py: request_scope.end is normalised to coverage.temporal.end when a plugin clamps availability below the requested end (e.g. CHIRPS3 lag) — fixes _artifact_coverage_matches_request_scope always returning False and causing unnecessary re-ingest on every subsequent sync call Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/orchestrator.py | 2 +- climate_api/ingestions/services.py | 24 ++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index 9f3e11e2..e8ee2760 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -116,7 +116,7 @@ async def run_ingest( cursor = load_cursor() if load_cursor else None last_committed: str | None = cursor.get("last_committed") if cursor else None - if last_committed and last_committed in all_periods: + if last_committed and last_committed in all_periods and store_path.exists(): idx = all_periods.index(last_committed) + 1 pending = all_periods[idx:] logger.info("Resuming after %s: %d/%d periods remain", last_committed, len(pending), len(all_periods)) diff --git a/climate_api/ingestions/services.py 
b/climate_api/ingestions/services.py index 6f451abb..1606f769 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -288,6 +288,12 @@ def _create_icechunk_artifact( _check_bbox_overlap(dataset, resolved_bbox) store_path = downloader.DOWNLOAD_DIR / f"{dataset_id}.icechunk" + if overwrite and store_path.exists(): + import shutil + + shutil.rmtree(store_path) + logger.info("Cleared existing store for overwrite: %s", store_path) + extent_country_code = extent.get("country_code") if extent else None extra_params: dict[str, object] = {} if extent_country_code: @@ -336,8 +342,10 @@ def _create_icechunk_artifact( from climate_api import config as api_config native_crs = _read_crs_from_spatial_ref(ds) or api_config.get_crs() or "EPSG:4326" - coverage_data = coverage_from_open_dataset(ds, period_type=period_type, native_crs=native_crs) - ds.close() + try: + coverage_data = coverage_from_open_dataset(ds, period_type=period_type, native_crs=native_crs) + finally: + ds.close() if not coverage_data.get("has_data", True): raise HTTPException(status_code=409, detail="Icechunk store contains no data for the requested scope") @@ -349,6 +357,18 @@ def _create_icechunk_artifact( spatial_wgs84=CoverageSpatial(**_spatial_wgs84_data) if _spatial_wgs84_data else None, ) + # When a plugin clamps availability (e.g. CHIRPS3 has a 3-month lag), the + # realized coverage end is earlier than the requested end. Normalise the + # stored request_scope to the actual coverage end so that + # _artifact_coverage_matches_request_scope passes on future requests for the + # same realized range, instead of triggering an unnecessary re-ingest. 
+ if request_scope.end is not None and coverage.temporal.end != request_scope.end: + request_scope = ArtifactRequestScope( + start=request_scope.start, + end=coverage.temporal.end, + bbox=request_scope.bbox, + ) + record = ArtifactRecord( artifact_id=str(uuid4()), dataset_id=dataset_id, From 31bd1219be9957d8ef093bc863109cbde2b86da7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 00:39:15 +0200 Subject: [PATCH 27/80] fix: transforms, expire_snapshots timing, sync duplicates, and protocol cleanup - Apply YAML-declared transforms (kelvin_to_celsius, metres_to_mm) in the plugin ingest path by threading apply_transforms through run_ingest / run_ingest_sync and calling it from _create_icechunk_artifact - Capture ingest_started_at before the period loop so expire_snapshots only marks snapshots created during this run, not the pre-existing HEAD - Fix sync appends accumulating duplicate artifact records: add _upsert_icechunk_artifact_record that matches by dataset_id + path (not request_scope) so delta appends update in place - Add rechunk_time: int | None to IngestionPlugin protocol; add rechunk_time = None to WorldPopPlugin and FakePlugin to satisfy runtime_checkable check - Replace assert isinstance with explicit TypeError raises in chirps3 and worldpop - Remove duplicate encoding clear from Era5LandPlugin._fetch_sync (orchestrator's _strip_cf_encoding already handles it) - Add test for time_dim=False static dataset branch and apply_transforms callback - Add comment on _get_bytes_sync private API usage - Update docs/architecture.md: replace stale "function-path" term Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/orchestrator.py | 10 ++- climate_api/ingest/plugins/chirps3.py | 3 +- climate_api/ingest/plugins/era5_land.py | 9 --- climate_api/ingest/plugins/worldpop.py | 4 +- climate_api/ingest/protocol.py | 7 +++ climate_api/ingestions/services.py | 32 +++++++++- docs/architecture.md | 2 +- tests/test_datasets_sync.py | 1 + 
tests/test_ingest_orchestrator.py | 82 +++++++++++++++++++++++++ 9 files changed, 136 insertions(+), 14 deletions(-) diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index e8ee2760..4365973f 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -93,6 +93,7 @@ async def run_ingest( save_cursor: Callable[[dict[str, Any]], None] | None = None, load_cursor: Callable[[], dict[str, Any] | None] | None = None, rechunk_time: int | None = None, + apply_transforms: Callable[[xr.Dataset], xr.Dataset] | None = None, ) -> None: """Probe the source then stream per-period data into an Icechunk store. @@ -138,6 +139,9 @@ async def run_ingest( # store and a store directory that exists as an empty skeleton from a # previous failed initialisation (where append_dim would fail on an empty store). is_first_write = done_offset == 0 + # Capture before any commits so expire_snapshots only marks snapshots that + # were created during this run, not the pre-existing HEAD. + ingest_started_at = datetime.now(tz=timezone.utc) repo = open_or_create_repo(store_path) semaphore = asyncio.Semaphore(plugin.max_concurrency) @@ -160,6 +164,8 @@ async def _fetch(period_id: str) -> xr.Dataset: ds = await task period_id = pending[i] + if apply_transforms is not None: + ds = apply_transforms(ds) _strip_cf_encoding(ds, period_type=period_type) # Each period uses its own writable session so that to_zarr(append_dim=) @@ -207,7 +213,7 @@ async def _fetch(period_id: str) -> xr.Dataset: # data — garbage_collect would be needed to reclaim manifest storage. # The "main" branch ref preserves HEAD even when it appears in the expired set. 
try: - expired = repo.expire_snapshots(older_than=datetime.now(tz=timezone.utc)) + expired = repo.expire_snapshots(older_than=ingest_started_at) if expired: logger.info("Expired %d intermediate snapshots from %s", len(expired), store_path) except Exception: @@ -228,6 +234,7 @@ def run_ingest_sync( save_cursor: Callable[[dict[str, Any]], None] | None = None, load_cursor: Callable[[], dict[str, Any] | None] | None = None, rechunk_time: int | None = None, + apply_transforms: Callable[[xr.Dataset], xr.Dataset] | None = None, ) -> None: """Synchronous wrapper around run_ingest for use in threaded job workers.""" asyncio.run( @@ -244,5 +251,6 @@ def run_ingest_sync( save_cursor=save_cursor, load_cursor=load_cursor, rechunk_time=rechunk_time, + apply_transforms=apply_transforms, ) ) diff --git a/climate_api/ingest/plugins/chirps3.py b/climate_api/ingest/plugins/chirps3.py index 97371e7f..cc76d61a 100644 --- a/climate_api/ingest/plugins/chirps3.py +++ b/climate_api/ingest/plugins/chirps3.py @@ -104,7 +104,8 @@ def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: logger.info("Fetching CHIRPS3 %s: %s", period_id, url) da = rioxarray.open_rasterio(url, chunks=None, masked=True, lock=False) - assert isinstance(da, xr.DataArray) + if not isinstance(da, xr.DataArray): + raise TypeError(f"rioxarray.open_rasterio returned {type(da).__name__!r}, expected DataArray") xmin, ymin, xmax, ymax = map(float, bbox) da = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax) da = da.squeeze("band", drop=True) diff --git a/climate_api/ingest/plugins/era5_land.py b/climate_api/ingest/plugins/era5_land.py index 37a5429f..15a14187 100644 --- a/climate_api/ingest/plugins/era5_land.py +++ b/climate_api/ingest/plugins/era5_land.py @@ -125,15 +125,6 @@ def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: ds = ds.rename({"longitude": "x", "latitude": "y"}) ds = ds.load() - # Strip zarr v2 codec encoding (Blosc) so the orchestrator writes - # with zarr 
v3-compatible defaults into the icechunk store. - for name in list(ds.data_vars) + list(ds.coords): - ds[name].encoding.clear() - # Pin time to a stable hourly unit so every period append uses the - # same encoding. Without this, the first write picks "days since …" - # and sub-daily values on subsequent appends land on the wrong hour. - if "time" in ds.coords: - ds["time"].encoding.update({"units": "hours since 1970-01-01", "dtype": "int64"}) return ds # ------------------------------------------------------------------ diff --git a/climate_api/ingest/plugins/worldpop.py b/climate_api/ingest/plugins/worldpop.py index 41281901..4d95b3d2 100644 --- a/climate_api/ingest/plugins/worldpop.py +++ b/climate_api/ingest/plugins/worldpop.py @@ -45,6 +45,7 @@ class WorldPopPlugin: max_concurrency = 1 commit_batch_size = 1 + rechunk_time: int | None = None def __init__(self, country_code: str, version: str = "global2") -> None: self.country_code = country_code.upper() @@ -92,7 +93,8 @@ def _fetch_sync(self, year: int, bbox: list[float]) -> xr.Dataset: resp.raise_for_status() da = rioxarray.open_rasterio(io.BytesIO(resp.content)) - assert isinstance(da, xr.DataArray) + if not isinstance(da, xr.DataArray): + raise TypeError(f"rioxarray.open_rasterio returned {type(da).__name__!r}, expected DataArray") xmin, ymin, xmax, ymax = map(float, bbox) da = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax) da = da.squeeze("band", drop=True) diff --git a/climate_api/ingest/protocol.py b/climate_api/ingest/protocol.py index 5afb4798..d5c6e16e 100644 --- a/climate_api/ingest/protocol.py +++ b/climate_api/ingest/protocol.py @@ -54,10 +54,17 @@ class IngestionPlugin(Protocol): controls how frequently the orchestrator persists the cursor so that a restart resumes from the last checkpoint rather than re-scanning the store. Use 1 for monthly sources, ~30 for daily, ~720 for hourly. + + rechunk_time: optional target time chunk size for the post-ingest rechunk. 
+ When set, the orchestrator rewrites the store after all periods are + committed so the time axis uses chunks of this size instead of the + per-period chunk-of-1. Omit (or set to None) to skip rechunking. + Typical values: 30 for daily, 720 for hourly. """ max_concurrency: int commit_batch_size: int + rechunk_time: int | None async def probe(self, bbox: list[float], **params: Any) -> GridSpec: """Metadata-only source probe. Returns grid spec. No data transfer.""" diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 1606f769..d6fce283 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -317,6 +317,8 @@ def _create_icechunk_artifact( end, rechunk_time, ) + transforms = dataset.get("transforms") + apply_transforms = (lambda ds: downloader._run_transforms(ds, dataset)) if transforms else None run_ingest_sync( plugin=plugin, params=params, @@ -326,6 +328,7 @@ def _create_icechunk_artifact( store_path=store_path, period_type=period_type, rechunk_time=rechunk_time, + apply_transforms=apply_transforms, ) if not store_path.exists(): @@ -383,7 +386,7 @@ def _create_icechunk_artifact( created_at=datetime.now(UTC), publication=ArtifactPublication(), ) - stored = _upsert_artifact_record(record, prefer_zarr=False, publish=publish, overwrite=overwrite) + stored = _upsert_icechunk_artifact_record(record, publish=publish) logger.info( "Stored Icechunk artifact '%s' for '%s': coverage=%s..%s", stored.artifact_id, @@ -714,6 +717,8 @@ def _serve_icechunk_key(dataset_id: str, artifact: ArtifactRecord, relative_path import zarr.core.buffer proto = zarr.core.buffer.default_buffer_prototype() + # IcechunkStore does not expose a public synchronous read method; _get_bytes_sync + # is the internal synchronous accessor used by zarr's own blocking read path. 
data = session.store._get_bytes_sync(key, prototype=proto) if data is None: # pyright: ignore[reportUnnecessaryComparison] raise HTTPException(status_code=404, detail=f"Zarr key '{relative_path}' not found in store") @@ -764,6 +769,31 @@ def mutate(records: list[ArtifactRecord]) -> ArtifactRecord: return _mutate_records(mutate) +def _upsert_icechunk_artifact_record(record: ArtifactRecord, *, publish: bool) -> ArtifactRecord: + """Persist an Icechunk artifact record, replacing any existing record for the same store path. + + Matches by dataset_id + path rather than request_scope so that sync appends + (which extend the end date) update the existing record in-place instead of + accumulating duplicate entries for the same physical store. + """ + + def mutate(records: list[ArtifactRecord]) -> ArtifactRecord: + for i, existing in enumerate(records): + if existing.dataset_id == record.dataset_id and existing.path == record.path: + replacement = record.model_copy( + update={ + "artifact_id": existing.artifact_id, + "publication": existing.publication, + } + ) + records[i] = replacement + return replacement + records.append(record) + return record + + return _mutate_records(mutate) + + def _upsert_artifact_record( record: ArtifactRecord, *, diff --git a/docs/architecture.md b/docs/architecture.md index 78dd2411..25dbf5ec 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -261,7 +261,7 @@ The sync engine validates that new data connects to the end of the existing arti ### The append execution mode -For **function-path** datasets, `append` downloads only the missing time range and rebuilds the full zarr from all cached files. The local cache (NetCDF files in `data/downloads/`) is the source of truth; the zarr is a derived view. If the cache is deleted, a rematerialize is required to recover. +For **legacy ZARR datasets** (downloader-based, no `ingestion.plugin`), `append` downloads only the missing time range and rebuilds the full zarr from all cached files. 
The local cache (NetCDF files in `data/downloads/`) is the source of truth; the zarr is a derived view. If the cache is deleted, a rematerialize is required to recover. For **plugin-path** datasets, `append` compares the pending period list against the already-committed time coordinates in the Icechunk store and fetches only the missing periods. The Icechunk store itself is the source of truth — no separate download cache. A crash leaves the store at the last committed period; restart resumes from there without any additional recovery logic. diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index 63654784..7261b727 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -1059,6 +1059,7 @@ def fake_run_ingest_sync(**kwargs: object) -> None: monkeypatch.setattr(svc, "get_extent", lambda: None) monkeypatch.setattr(svc.downloader, "DOWNLOAD_DIR", tmp_path) monkeypatch.setattr(svc, "_store_artifact_record", lambda record, **_: record) + monkeypatch.setattr(svc, "_upsert_icechunk_artifact_record", lambda record, **_: record) class _FakeRepo: diff --git a/tests/test_ingest_orchestrator.py b/tests/test_ingest_orchestrator.py index adb99ca8..88671a53 100644 --- a/tests/test_ingest_orchestrator.py +++ b/tests/test_ingest_orchestrator.py @@ -44,6 +44,7 @@ class FakePlugin: max_concurrency = 2 commit_batch_size = 2 + rechunk_time: int | None = None def __init__(self, periods: list[str]) -> None: self._periods = periods @@ -504,3 +505,84 @@ def test_era5land_plugin_declares_rechunk_time() -> None: plugin = Era5LandPlugin(variable="t2m") assert plugin.rechunk_time == 12 + + +# --------------------------------------------------------------------------- +# time_dim=False (static datasets) +# --------------------------------------------------------------------------- + + +class FakeStaticPlugin(FakePlugin): + """FakePlugin variant whose probe returns time_dim=False (static dataset).""" + + async def probe(self, bbox: list[float], **params: 
Any) -> GridSpec: + return GridSpec(shape=(4, 4), crs=4326, dtype=np.dtype("float32"), nodata=None, time_dim=False) + + async def fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> xr.Dataset: + self.fetched.append(period_id) + return xr.Dataset( + {"elevation": xr.DataArray(np.zeros((4, 4), dtype="float32"), dims=["y", "x"])}, + ) + + +def test_run_ingest_static_dataset_writes_once(tmp_path: Path) -> None: + """time_dim=False: the orchestrator commits only one write (no append) and the + store has no time dimension.""" + import icechunk + import zarr + + plugin = FakeStaticPlugin(["2024-01", "2024-02", "2024-03"]) + store_path = tmp_path / "static.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + ) + ) + + assert store_path.exists() + # The store must exist and contain the static variable without a time axis. + repo = icechunk.Repository.open(icechunk.local_filesystem_storage(str(store_path))) + session = repo.readonly_session("main") + g = zarr.open_group(session.store, mode="r") + assert "elevation" in g + assert "time" not in g + + +# --------------------------------------------------------------------------- +# apply_transforms +# --------------------------------------------------------------------------- + + +def test_run_ingest_apply_transforms_called_per_period(tmp_path: Path) -> None: + """apply_transforms is invoked for every fetched period before writing.""" + plugin = FakePlugin(["2024-01", "2024-02"]) + store_path = tmp_path / "test.icechunk" + transform_calls: list[str] = [] + + def record_transform(ds: xr.Dataset) -> xr.Dataset: + transform_calls.append(str(ds.time.values[0])) + return ds + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-02", + store_path=store_path, + period_type="monthly", + 
apply_transforms=record_transform, + ) + ) + + assert len(transform_calls) == 2 + committed = read_committed_period_ids(store_path, "monthly") + assert committed == {"2024-01", "2024-02"} From 469a07313653fc8c5315898db5b09e862b56728d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 00:56:15 +0200 Subject: [PATCH 28/80] feat: derive sync availability from plugin.periods() instead of latest_available_function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sync engine now calls plugin.periods(next_period_start, requested_end) directly during planning for plugin-based datasets, instead of relying on a separate latest_available_function declared in the YAML. The lag cutoff lives in the plugin and is no longer duplicated. _latest_available_end gains a current_end parameter and delegates to the new _plugin_latest_available_period helper when ingestion.plugin is set. The helper instantiates the plugin (falling back gracefully on TypeError for plugins that need extra_params like country_code) and calls asyncio.run(plugin.periods()). An empty periods list means nothing new → returns current_end so the NOOP check fires. The legacy sync.availability / latest_available_function path is kept for datasets without a plugin. Remove the now-redundant availability blocks from era5_land.yaml. 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/data/datasets/era5_land.yaml | 6 - climate_api/ingestions/sync_engine.py | 86 ++++++++- tests/test_datasets_sync.py | 223 +++++++++++++++++++++++ 3 files changed, 300 insertions(+), 15 deletions(-) diff --git a/climate_api/data/datasets/era5_land.yaml b/climate_api/data/datasets/era5_land.yaml index 00978f4a..58ca9e00 100644 --- a/climate_api/data/datasets/era5_land.yaml +++ b/climate_api/data/datasets/era5_land.yaml @@ -6,9 +6,6 @@ sync: kind: temporal execution: append - availability: - latest_available_function: climate_api.providers.availability.lagged_latest_available - lag_hours: 120 extents: spatial: bbox: [-180, -90, 180, 90] @@ -37,9 +34,6 @@ sync: kind: temporal execution: append - availability: - latest_available_function: climate_api.providers.availability.lagged_latest_available - lag_hours: 120 extents: spatial: bbox: [-180, -90, 180, 90] diff --git a/climate_api/ingestions/sync_engine.py b/climate_api/ingestions/sync_engine.py index 4f64ae82..798ab253 100644 --- a/climate_api/ingestions/sync_engine.py +++ b/climate_api/ingestions/sync_engine.py @@ -11,6 +11,7 @@ from __future__ import annotations +import asyncio import importlib import inspect import logging @@ -88,7 +89,9 @@ def plan_sync( resolved_end = normalize_period_string(normalized_requested_end, period_type) else: resolved_end = _default_target_end(period_type=period_type) - latest_available_end = _latest_available_end(source_dataset=source_dataset, requested_end=resolved_end) + latest_available_end = _latest_available_end( + source_dataset=source_dataset, requested_end=resolved_end, current_end=current_end + ) target_end_source = ( requested_target_end_source if latest_available_end == resolved_end @@ -334,13 +337,34 @@ def _default_target_end(*, period_type: str) -> str: raise ValueError(f"Unsupported period_type '{period_type}' for sync") -def _latest_available_end(*, source_dataset: dict[str, Any], requested_end: str) -> str: - """Clamp 
requested sync end to the latest upstream state declared by template metadata. +def _latest_available_end(*, source_dataset: dict[str, Any], requested_end: str, current_end: str | None = None) -> str: + """Clamp requested sync end to the latest upstream state. + + For plugin datasets (ingestion.plugin set) the plugin's own periods() method + is the authoritative availability source — no separate latest_available_function + is needed in the YAML. The lag cutoff lives in the plugin and is not duplicated. - The current engine does not query upstream providers directly. Instead it can - apply conservative template metadata so sync planning does not overshoot known - provider lag or release cadence. + For legacy ZARR datasets the existing sync.availability metadata (lag_days or + latest_available_function) is used unchanged. + + current_end must be provided for the plugin path so the function can return it + when periods() reports nothing new (empty list → NOOP detected by caller). """ + period_type = source_dataset.get("period_type") + if current_end is not None and isinstance(period_type, str): + ingestion = source_dataset.get("ingestion") + if isinstance(ingestion, dict) and isinstance(ingestion.get("plugin"), str): + next_start = _next_period_start(current_end, period_type=period_type) + plugin_latest = _plugin_latest_available_period( + source_dataset=source_dataset, + next_period_start=next_start, + requested_end=requested_end, + current_end=current_end, + ) + if plugin_latest is not None: + return min(requested_end, plugin_latest) + + # Legacy path: lag metadata or latest_available_function in sync.availability. 
availability = source_dataset.get("sync", {}).get("availability") if not isinstance(availability, dict): return requested_end @@ -352,9 +376,6 @@ def _latest_available_end(*, source_dataset: dict[str, Any], requested_end: str) ) if provider_latest is not None: return min(requested_end, provider_latest) - # Keep the legacy metadata-only lag fallback for templates that do not yet - # declare a latest_available_function, but delegate to the provider helper - # so lag logic lives in one place. return min( requested_end, provider_availability.lagged_latest_available( @@ -364,6 +385,53 @@ def _latest_available_end(*, source_dataset: dict[str, Any], requested_end: str) ) +def _plugin_latest_available_period( + *, + source_dataset: dict[str, Any], + next_period_start: str, + requested_end: str, + current_end: str, +) -> str | None: + """Return the last period available from next_period_start..requested_end via the plugin. + + Returns: + - str: the last available period in the range (may equal current_end when nothing new) + - None: plugin could not be instantiated (caller falls back to legacy availability logic) + + Calls asyncio.run() which requires no running event loop — plan_sync is synchronous + and FastAPI runs sync handlers in a thread pool, so this is safe. 
+ """ + ingestion = source_dataset.get("ingestion") + if not isinstance(ingestion, dict): + return None + plugin_path = ingestion.get("plugin") + if not isinstance(plugin_path, str): + return None + + _raw_params = ingestion.get("params") + params: dict[str, Any] = dict(_raw_params) if isinstance(_raw_params, dict) else {} + + try: + from climate_api.ingest.orchestrator import load_plugin + + plugin = load_plugin(plugin_path, params) + except (TypeError, ValueError, ImportError, AttributeError) as exc: + logger.debug( + "Plugin '%s' cannot be instantiated for availability check (needs extra_params?): %s", + plugin_path, + exc, + ) + return None + + try: + periods = asyncio.run(plugin.periods(next_period_start, requested_end)) + except Exception as exc: + logger.debug("plugin.periods() failed during availability check for '%s': %s", plugin_path, exc) + return None + + return periods[-1] if periods else current_end + + def _supports_append(source_dataset: dict[str, Any], latest_artifact: ArtifactRecord) -> bool: """Return whether this artifact supports incremental append sync execution. 
diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index 7261b727..6ce241f4 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -748,6 +748,229 @@ def test_latest_available_end_wraps_invalid_provider_function_path(monkeypatch: ) +def test_latest_available_end_uses_plugin_periods_for_plugin_datasets( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """For plugin datasets, _latest_available_end calls plugin.periods() instead of + latest_available_function.""" + monkeypatch.setattr( + sync_engine, + "_plugin_latest_available_period", + lambda *, source_dataset, next_period_start, requested_end, current_end: "2026-02-08", + ) + + result = sync_engine._latest_available_end( + source_dataset={ + "id": "era5land_temperature_hourly", + "period_type": "daily", + "sync": {"kind": "temporal"}, + "ingestion": {"plugin": "some.Plugin", "params": {}}, + }, + requested_end="2026-02-10", + current_end="2026-02-06", + ) + + assert result == "2026-02-08" + + +def test_latest_available_end_plugin_returns_current_end_when_no_new_periods( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When plugin.periods() returns an empty list, _latest_available_end returns current_end + so the NOOP check fires correctly.""" + monkeypatch.setattr( + sync_engine, + "_plugin_latest_available_period", + lambda *, source_dataset, next_period_start, requested_end, current_end: current_end, + ) + + result = sync_engine._latest_available_end( + source_dataset={ + "id": "era5land_temperature_hourly", + "period_type": "daily", + "sync": {"kind": "temporal"}, + "ingestion": {"plugin": "some.Plugin", "params": {}}, + }, + requested_end="2026-02-10", + current_end="2026-02-06", + ) + + assert result == "2026-02-06" + + +def test_latest_available_end_falls_back_to_legacy_when_plugin_unavailable( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When _plugin_latest_available_period returns None (e.g. 
plugin needs extra_params), + the legacy latest_available_function path is used.""" + monkeypatch.setattr( + sync_engine, + "_plugin_latest_available_period", + lambda **_kw: None, + ) + monkeypatch.setattr(sync_engine, "_get_dynamic_function", lambda _: lambda: "2026-02-05") + + result = sync_engine._latest_available_end( + source_dataset={ + "id": "worldpop_population_yearly", + "period_type": "daily", + "sync": {"availability": {"latest_available_function": "provider.latest_available"}}, + "ingestion": {"plugin": "some.Plugin", "params": {}}, + }, + requested_end="2026-02-10", + current_end="2026-02-06", + ) + + assert result == "2026-02-05" + + +def test_plugin_latest_available_period_returns_last_period() -> None: + """_plugin_latest_available_period returns the last item from plugin.periods().""" + + class FakePlugin: + max_concurrency = 1 + commit_batch_size = 1 + rechunk_time = None + + async def probe(self, *_a: object, **_k: object) -> object: # type: ignore[override] + raise NotImplementedError + + async def periods(self, start: str, end: str) -> list[str]: + return [d for d in ["2026-02-07", "2026-02-08", "2026-02-09"] if start <= d <= end] + + async def fetch_period(self, *_a: object, **_k: object) -> object: # type: ignore[override] + raise NotImplementedError + + import climate_api.ingest.orchestrator as orch_mod + + orig = orch_mod.load_plugin + orch_mod.load_plugin = lambda path, params, extra_params=None: FakePlugin() # type: ignore[assignment] + try: + result = sync_engine._plugin_latest_available_period( + source_dataset={"ingestion": {"plugin": "fake.Plugin", "params": {}}}, + next_period_start="2026-02-07", + requested_end="2026-02-10", + current_end="2026-02-06", + ) + finally: + orch_mod.load_plugin = orig + + assert result == "2026-02-09" + + +def test_plugin_latest_available_period_returns_current_end_when_empty() -> None: + """When plugin.periods() returns [], _plugin_latest_available_period returns current_end.""" + + class EmptyPlugin: 
+ max_concurrency = 1 + commit_batch_size = 1 + rechunk_time = None + + async def probe(self, *_a: object, **_k: object) -> object: # type: ignore[override] + raise NotImplementedError + + async def periods(self, start: str, end: str) -> list[str]: + return [] + + async def fetch_period(self, *_a: object, **_k: object) -> object: # type: ignore[override] + raise NotImplementedError + + import climate_api.ingest.orchestrator as orch_mod + + orig = orch_mod.load_plugin + orch_mod.load_plugin = lambda *_a, **_kw: EmptyPlugin() # type: ignore[assignment] + try: + result = sync_engine._plugin_latest_available_period( + source_dataset={"ingestion": {"plugin": "fake.Plugin", "params": {}}}, + next_period_start="2026-02-07", + requested_end="2026-02-10", + current_end="2026-02-06", + ) + finally: + orch_mod.load_plugin = orig + + assert result == "2026-02-06" + + +def test_plugin_latest_available_period_returns_none_on_instantiation_failure() -> None: + """TypeError during load_plugin (e.g. plugin needs country_code) → returns None.""" + import climate_api.ingest.orchestrator as orch_mod + + orig = orch_mod.load_plugin + + def explode(path: str, params: dict, extra_params: object = None) -> object: + raise TypeError("country_code is required") + + orch_mod.load_plugin = explode # type: ignore[assignment] + try: + result = sync_engine._plugin_latest_available_period( + source_dataset={"ingestion": {"plugin": "worldpop.Plugin", "params": {}}}, + next_period_start="2026", + requested_end="2026", + current_end="2025", + ) + finally: + orch_mod.load_plugin = orig + + assert result is None + + +def test_plan_sync_uses_plugin_periods_for_availability(monkeypatch: pytest.MonkeyPatch) -> None: + """For an ICECHUNK artifact backed by a plugin, plan_sync calls plugin.periods() to + determine whether new data is available, not a static lag function.""" + monkeypatch.setattr( + sync_engine, + "_plugin_latest_available_period", + lambda *, source_dataset, next_period_start, 
requested_end, current_end: "2024-01-01T05", + ) + + artifact = _icechunk_artifact(artifact_id="a1", end="2024-01-01T03") + result = sync_engine.plan_sync( + source_dataset={ + "id": "era5land_temperature_hourly", + "period_type": "hourly", + "sync": {"kind": "temporal"}, + "ingestion": { + "plugin": "climate_api.ingest.plugins.era5_land.Era5LandPlugin", + "params": {"variable": "t2m"}, + }, + }, + latest_artifact=artifact, + requested_end="2024-01-01T10", + ) + + assert result.action == "append" + assert result.target_end == "2024-01-01T05" + assert result.delta_start == "2024-01-01T04" + assert result.delta_end == "2024-01-01T05" + + +def test_plan_sync_noop_when_plugin_reports_no_new_periods(monkeypatch: pytest.MonkeyPatch) -> None: + """plan_sync returns NO_OP when plugin.periods() is empty (nothing new since current_end).""" + monkeypatch.setattr( + sync_engine, + "_plugin_latest_available_period", + lambda *, source_dataset, next_period_start, requested_end, current_end: current_end, + ) + + artifact = _icechunk_artifact(artifact_id="a1", end="2024-01-01T03") + result = sync_engine.plan_sync( + source_dataset={ + "id": "era5land_temperature_hourly", + "period_type": "hourly", + "sync": {"kind": "temporal"}, + "ingestion": { + "plugin": "climate_api.ingest.plugins.era5_land.Era5LandPlugin", + "params": {"variable": "t2m"}, + }, + }, + latest_artifact=artifact, + requested_end="2024-01-01T10", + ) + + assert result.action == "no_op" + + def test_sync_plan_route_returns_500_for_provider_hook_misconfiguration( client: TestClient, monkeypatch: pytest.MonkeyPatch, From 1c27272656be588818104c63e2a1327d4ccb976b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 01:03:46 +0200 Subject: [PATCH 29/80] =?UTF-8?q?feat:=20remove=20chirps3/worldpop=20lates?= =?UTF-8?q?t=5Favailable=5Ffunction=20=E2=80=94=20plugin.periods()=20owns?= =?UTF-8?q?=20availability?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Chirps3Plugin._availability_cutoff uses _COMPLETE_AFTER_DAY = 20 which is identical to chirps3_daily_latest_available. WorldPopPlugin._build_periods clamps to (2015, 2030) for global2, equivalent to the allow_future behaviour but correctly bounded. Make WorldPopPlugin.country_code optional (default "") so load_plugin succeeds for planning without the extent-injected country code — country_code is only needed in fetch_period, not periods(). Remove chirps3_daily_latest_available, worldpop_release_latest_available, and their helpers (_add_months, monthrange import) from availability.py. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/data/datasets/chirps3.yaml | 2 -- climate_api/data/datasets/worldpop.yaml | 4 --- climate_api/ingest/plugins/worldpop.py | 2 +- climate_api/providers/availability.py | 44 +----------------------- tests/test_datasets_sync.py | 8 ++--- tests/test_provider_availability.py | 45 ------------------------- 6 files changed, 5 insertions(+), 100 deletions(-) diff --git a/climate_api/data/datasets/chirps3.yaml b/climate_api/data/datasets/chirps3.yaml index 076e8784..c1218e6b 100644 --- a/climate_api/data/datasets/chirps3.yaml +++ b/climate_api/data/datasets/chirps3.yaml @@ -6,8 +6,6 @@ sync: kind: temporal execution: append - availability: - latest_available_function: climate_api.providers.availability.chirps3_daily_latest_available extents: spatial: bbox: [-180, -50, 180, 50] diff --git a/climate_api/data/datasets/worldpop.yaml b/climate_api/data/datasets/worldpop.yaml index fcdaa921..448fbdc6 100644 --- a/climate_api/data/datasets/worldpop.yaml +++ b/climate_api/data/datasets/worldpop.yaml @@ -5,10 +5,6 @@ period_type: yearly sync: kind: release - availability: - latest_available_function: climate_api.providers.availability.worldpop_release_latest_available - # WorldPop projections are intentionally request-driven for future years. 
- allow_future: true extents: spatial: bbox: [-180, -90, 180, 90] diff --git a/climate_api/ingest/plugins/worldpop.py b/climate_api/ingest/plugins/worldpop.py index 4d95b3d2..305c8ec1 100644 --- a/climate_api/ingest/plugins/worldpop.py +++ b/climate_api/ingest/plugins/worldpop.py @@ -47,7 +47,7 @@ class WorldPopPlugin: commit_batch_size = 1 rechunk_time: int | None = None - def __init__(self, country_code: str, version: str = "global2") -> None: + def __init__(self, country_code: str = "", version: str = "global2") -> None: self.country_code = country_code.upper() self.version = version diff --git a/climate_api/providers/availability.py b/climate_api/providers/availability.py index 1969d767..9205fe47 100644 --- a/climate_api/providers/availability.py +++ b/climate_api/providers/availability.py @@ -7,33 +7,12 @@ from __future__ import annotations -from calendar import monthrange -from datetime import date, timedelta +from datetime import timedelta from typing import Any from climate_api.shared.time import datetime_to_period_string, utc_now, utc_today -def chirps3_daily_latest_available(*, dataset: dict[str, Any], requested_end: str) -> str: - """Return latest complete CHIRPS3 daily period available for safe sync. - - The dhis2eo CHIRPS3 downloader groups daily files by source month. For - final/rnl data, use only fully released months by default: after the 20th, - the previous month is considered available; otherwise the month before that - is the latest safe complete month. 
- """ - availability = _availability_metadata(dataset) - threshold_day = availability.get("complete_month_after_day", 20) - if not isinstance(threshold_day, int): - threshold_day = 20 - - today = utc_today() - months_back = 1 if today.day > threshold_day else 2 - available_month = _add_months(today.replace(day=1), -months_back) - latest_day = monthrange(available_month.year, available_month.month)[1] - return date(available_month.year, available_month.month, latest_day).isoformat() - - def lagged_latest_available(*, dataset: dict[str, Any], requested_end: str) -> str: """Return latest available period by applying YAML-declared lag metadata.""" availability = _availability_metadata(dataset) @@ -61,28 +40,7 @@ def lagged_latest_available(*, dataset: dict[str, Any], requested_end: str) -> s return requested_end -def worldpop_release_latest_available(*, dataset: dict[str, Any], requested_end: str) -> str: - """Return WorldPop release availability, including configured projections.""" - availability = _availability_metadata(dataset) - if availability.get("allow_future") is True: - return requested_end - - latest_year = availability.get("latest_year") - if isinstance(latest_year, int): - return str(latest_year) - - return lagged_latest_available(dataset=dataset, requested_end=requested_end) - - def _availability_metadata(dataset: dict[str, Any]) -> dict[str, Any]: """Return sync availability metadata from a dataset template.""" availability = dataset.get("sync", {}).get("availability") return availability if isinstance(availability, dict) else {} - - -def _add_months(value: date, offset: int) -> date: - """Add a month offset to the first day of a month.""" - month_index = value.year * 12 + value.month - 1 + offset - year = month_index // 12 - month = month_index % 12 + 1 - return date(year, month, 1) diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index 6ce241f4..c27d1282 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ 
-623,16 +623,14 @@ def test_plan_sync_marks_request_target_clamped_by_availability(monkeypatch: pyt result = sync_engine.plan_sync( source_dataset={ - "id": "chirps3_precipitation_daily", + "id": "legacy_dataset", "period_type": "daily", "sync": { "kind": "temporal", "execution": "append", - "availability": { - "latest_available_function": "climate_api.providers.availability.chirps3_daily_latest_available" - }, + "availability": {"latest_available_function": "provider.latest_available"}, }, - "ingestion": {}, + # No ingestion.plugin — exercises the legacy latest_available_function path. }, latest_artifact=_artifact(artifact_id="a1", end="2026-02-28"), requested_end="2026-04-21", diff --git a/tests/test_provider_availability.py b/tests/test_provider_availability.py index 83ff8faf..50a1fd06 100644 --- a/tests/test_provider_availability.py +++ b/tests/test_provider_availability.py @@ -5,42 +5,6 @@ from climate_api.providers import availability -def test_chirps3_daily_latest_available_uses_previous_complete_month_after_threshold( - monkeypatch: pytest.MonkeyPatch, -) -> None: - class FixedDate(date): - @classmethod - def today(cls) -> "FixedDate": - return cls(2026, 4, 21) - - monkeypatch.setattr(availability, "utc_today", lambda: FixedDate(2026, 4, 21)) - - result = availability.chirps3_daily_latest_available( - dataset={"sync": {"availability": {"complete_month_after_day": 20}}}, - requested_end="2026-04-21", - ) - - assert result == "2026-03-31" - - -def test_chirps3_daily_latest_available_uses_month_before_previous_on_threshold_day( - monkeypatch: pytest.MonkeyPatch, -) -> None: - class FixedDate(date): - @classmethod - def today(cls) -> "FixedDate": - return cls(2026, 4, 20) - - monkeypatch.setattr(availability, "utc_today", lambda: FixedDate(2026, 4, 20)) - - result = availability.chirps3_daily_latest_available( - dataset={"sync": {"availability": {"complete_month_after_day": 20}}}, - requested_end="2026-04-20", - ) - - assert result == "2026-02-28" - - def 
test_lagged_latest_available_formats_hourly_lag(monkeypatch: pytest.MonkeyPatch) -> None: class FixedDateTime(datetime): @classmethod @@ -79,15 +43,6 @@ def today(cls) -> "FixedDate": assert result == "2026-04-19" -def test_worldpop_release_latest_available_allows_configured_future_projection() -> None: - result = availability.worldpop_release_latest_available( - dataset={"period_type": "yearly", "sync": {"availability": {"allow_future": True}}}, - requested_end="2030", - ) - - assert result == "2030" - - def test_lagged_latest_available_formats_yearly_offset(monkeypatch: pytest.MonkeyPatch) -> None: class FixedDate(date): @classmethod From 159539bfeaaa332b572ed04670c7d02b6ebb3dc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 01:08:43 +0200 Subject: [PATCH 30/80] =?UTF-8?q?refactor:=20remove=20legacy=20latest=5Fav?= =?UTF-8?q?ailable=5Ffunction=20dispatch=20=E2=80=94=20plugin.periods()=20?= =?UTF-8?q?is=20the=20only=20availability=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Delete availability.py entirely: lagged_latest_available and its helpers are no longer called. Remove _provider_latest_available_end, _get_dynamic_function, and the import inspect / provider_availability import from sync_engine.py. _latest_available_end now only consults the plugin; datasets without a plugin fall through to requested_end unchanged (which equals the default today value). 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingestions/sync_engine.py | 80 +------- climate_api/providers/availability.py | 46 ----- tests/test_datasets_sync.py | 255 -------------------------- tests/test_provider_availability.py | 62 ------- 4 files changed, 4 insertions(+), 439 deletions(-) delete mode 100644 climate_api/providers/availability.py delete mode 100644 tests/test_provider_availability.py diff --git a/climate_api/ingestions/sync_engine.py b/climate_api/ingestions/sync_engine.py index 798ab253..4236df3e 100644 --- a/climate_api/ingestions/sync_engine.py +++ b/climate_api/ingestions/sync_engine.py @@ -12,15 +12,12 @@ from __future__ import annotations import asyncio -import importlib -import inspect import logging from collections.abc import Callable from datetime import date, datetime, time, timedelta from typing import Any from climate_api.ingestions.schemas import ArtifactRecord, SyncAction, SyncDetail, SyncKind, SyncResponse -from climate_api.providers import availability as provider_availability from climate_api.publications.services import managed_dataset_id_for from climate_api.shared.time import ( datetime_to_period_string, @@ -338,17 +335,10 @@ def _default_target_end(*, period_type: str) -> str: def _latest_available_end(*, source_dataset: dict[str, Any], requested_end: str, current_end: str | None = None) -> str: - """Clamp requested sync end to the latest upstream state. + """Clamp requested sync end to the latest upstream state via the plugin's periods() method. - For plugin datasets (ingestion.plugin set) the plugin's own periods() method - is the authoritative availability source — no separate latest_available_function - is needed in the YAML. The lag cutoff lives in the plugin and is not duplicated. - - For legacy ZARR datasets the existing sync.availability metadata (lag_days or - latest_available_function) is used unchanged. 
- - current_end must be provided for the plugin path so the function can return it - when periods() reports nothing new (empty list → NOOP detected by caller). + current_end must be provided so the function can return it when periods() reports nothing + new (empty list → NOOP detected by caller). """ period_type = source_dataset.get("period_type") if current_end is not None and isinstance(period_type, str): @@ -364,25 +354,7 @@ def _latest_available_end(*, source_dataset: dict[str, Any], requested_end: str, if plugin_latest is not None: return min(requested_end, plugin_latest) - # Legacy path: lag metadata or latest_available_function in sync.availability. - availability = source_dataset.get("sync", {}).get("availability") - if not isinstance(availability, dict): - return requested_end - - provider_latest = _provider_latest_available_end( - source_dataset=source_dataset, - availability=availability, - requested_end=requested_end, - ) - if provider_latest is not None: - return min(requested_end, provider_latest) - return min( - requested_end, - provider_availability.lagged_latest_available( - dataset=source_dataset, - requested_end=requested_end, - ), - ) + return requested_end def _plugin_latest_available_period( @@ -462,47 +434,3 @@ def _supports_append(source_dataset: dict[str, Any], latest_artifact: ArtifactRe ) return False return True - - -def _provider_latest_available_end( - *, - source_dataset: dict[str, Any], - availability: dict[str, Any], - requested_end: str, -) -> str | None: - """Call an optional provider-specific latest-availability function.""" - function_path = availability.get("latest_available_function") - if not isinstance(function_path, str) or not function_path: - return None - - try: - latest_available_fn = _get_dynamic_function(function_path) - params: dict[str, Any] = {} - signature = inspect.signature(latest_available_fn) - if "dataset" in signature.parameters: - params["dataset"] = source_dataset - if "requested_end" in 
signature.parameters: - params["requested_end"] = requested_end - result = latest_available_fn(**params) - except (AttributeError, ImportError, TypeError, ValueError) as exc: - raise SyncConfigurationError(f"Latest availability function '{function_path}' failed: {exc}") from exc - if not isinstance(result, str): - raise SyncConfigurationError(f"Latest availability function '{function_path}' must return a period string") - try: - return normalize_period_string(result, period_type=str(source_dataset["period_type"])) - except (KeyError, TypeError, ValueError) as exc: - raise SyncConfigurationError( - f"Latest availability function '{function_path}' returned invalid period " - f"'{result}' for dataset period_type '{source_dataset.get('period_type')}'" - ) from exc - - -def _get_dynamic_function(full_path: str) -> Callable[..., Any]: - """Import and return a function given its dotted module path.""" - parts = full_path.split(".") - if len(parts) < 2 or any(not part for part in parts): - raise ValueError(f"Invalid dotted function path '{full_path}'") - module_path = ".".join(parts[:-1]) - function_name = parts[-1] - module = importlib.import_module(module_path) - return getattr(module, function_name) # type: ignore[no-any-return] diff --git a/climate_api/providers/availability.py b/climate_api/providers/availability.py deleted file mode 100644 index 9205fe47..00000000 --- a/climate_api/providers/availability.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Provider availability policies used by sync planning. - -These functions keep source-specific release cadence rules out of the generic -sync engine. They are intentionally small and metadata-driven so dataset YAML can -choose the right policy per upstream provider. 
-""" - -from __future__ import annotations - -from datetime import timedelta -from typing import Any - -from climate_api.shared.time import datetime_to_period_string, utc_now, utc_today - - -def lagged_latest_available(*, dataset: dict[str, Any], requested_end: str) -> str: - """Return latest available period by applying YAML-declared lag metadata.""" - availability = _availability_metadata(dataset) - period_type = str(dataset.get("period_type", "daily")) - - if period_type == "hourly": - lag_hours = availability.get("lag_hours") - if isinstance(lag_hours, int) and lag_hours > 0: - latest = utc_now() - timedelta(hours=lag_hours) - return datetime_to_period_string(latest, period_type) - return requested_end - - lag_days = availability.get("lag_days") - if period_type in {"daily", "monthly"} and isinstance(lag_days, int) and lag_days > 0: - latest_date = utc_today() - timedelta(days=lag_days) - if period_type == "monthly": - return f"{latest_date.year:04d}-{latest_date.month:02d}" - return latest_date.isoformat() - - if period_type == "yearly": - latest_year_offset = availability.get("latest_year_offset") - if isinstance(latest_year_offset, int) and latest_year_offset >= 0: - return str(utc_today().year - latest_year_offset) - - return requested_end - - -def _availability_metadata(dataset: dict[str, Any]) -> dict[str, Any]: - """Return sync availability metadata from a dataset template.""" - availability = dataset.get("sync", {}).get("availability") - return availability if isinstance(availability, dict) else {} diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index c27d1282..0db02e0c 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -287,59 +287,6 @@ def test_sync_dataset_release_policy_returns_up_to_date_when_release_matches(mon assert result.sync_detail.reason == "no_new_release" -def test_sync_dataset_release_policy_clamps_future_year_by_template_availability( - monkeypatch: pytest.MonkeyPatch, -) -> None: - 
dataset_id = "release_dataset_sle" - latest = _artifact( - artifact_id="a1", - source_dataset_id="release_dataset_yearly", - managed_dataset_id=dataset_id, - end="2024", - ) - monkeypatch.setattr(services, "get_latest_artifact_for_dataset_or_404", lambda _: latest) - monkeypatch.setattr( - sync_engine.provider_availability, - "utc_today", - lambda: date(2026, 4, 15), - ) - monkeypatch.setattr( - services.registry_datasets, - "get_dataset", - lambda _: { - "id": "release_dataset_yearly", - "period_type": "yearly", - "sync": {"kind": "release", "availability": {"latest_year_offset": 1}}, - }, - ) - - captured: dict[str, object] = {} - - def fake_create_artifact(**kwargs: object) -> ArtifactRecord: - captured.update(kwargs) - return _artifact( - artifact_id="a2", - source_dataset_id="release_dataset_yearly", - managed_dataset_id=dataset_id, - end="2025", - ) - - monkeypatch.setattr(services, "create_artifact", fake_create_artifact) - monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) - - result = services.sync_dataset(dataset_id=dataset_id, end="2026", prefer_zarr=True, publish=True) - - assert captured["start"] == "2026-01-01" - assert captured["end"] == "2025" - assert result.status == "completed" - assert result.sync_detail.sync_kind == SyncKind.RELEASE - assert result.sync_detail.action == SyncAction.REMATERIALIZE - assert result.sync_detail.target_end == "2025" - assert result.sync_detail.target_end_source == "request_clamped_by_availability" - assert result.sync_detail.delta_start is None - assert result.sync_detail.delta_end is None - - def test_default_hourly_target_end_is_utc_aware(monkeypatch: pytest.MonkeyPatch) -> None: class FixedDateTime(datetime): @classmethod @@ -573,26 +520,6 @@ def test_sync_route_executes_rematerialize_and_returns_structured_detail( assert payload["sync_detail"]["target_end"] == "2026-02-10" -def test_latest_available_end_preserves_requested_month_without_lag(monkeypatch: pytest.MonkeyPatch) 
-> None: - class FixedDate(date): - @classmethod - def today(cls) -> "FixedDate": - return cls(2026, 4, 15) - - monkeypatch.setattr(sync_engine.provider_availability, "utc_today", lambda: FixedDate(2026, 4, 15)) - - result = sync_engine._latest_available_end( - source_dataset={ - "id": "monthly_dataset", - "period_type": "monthly", - "sync": {"availability": {"lag_days": 0}}, - }, - requested_end="2026-05", - ) - - assert result == "2026-05" - - def test_plan_sync_marks_default_target_end_source(monkeypatch: pytest.MonkeyPatch) -> None: class FixedDate(date): @classmethod @@ -618,134 +545,6 @@ def today(cls) -> "FixedDate": assert result.delta_end == "2026-04-20" -def test_plan_sync_marks_request_target_clamped_by_availability(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(sync_engine, "_get_dynamic_function", lambda _: lambda: "2026-03-31") - - result = sync_engine.plan_sync( - source_dataset={ - "id": "legacy_dataset", - "period_type": "daily", - "sync": { - "kind": "temporal", - "execution": "append", - "availability": {"latest_available_function": "provider.latest_available"}, - }, - # No ingestion.plugin — exercises the legacy latest_available_function path. 
- }, - latest_artifact=_artifact(artifact_id="a1", end="2026-02-28"), - requested_end="2026-04-21", - ) - - assert result.target_end == "2026-03-31" - assert result.target_end_source == "request_clamped_by_availability" - assert result.delta_start == "2026-03-01" - assert result.delta_end == "2026-03-31" - - -def test_latest_available_end_clamps_monthly_lag_to_month_period(monkeypatch: pytest.MonkeyPatch) -> None: - class FixedDate(date): - @classmethod - def today(cls) -> "FixedDate": - return cls(2026, 4, 15) - - monkeypatch.setattr(sync_engine.provider_availability, "utc_today", lambda: FixedDate(2026, 4, 15)) - - result = sync_engine._latest_available_end( - source_dataset={ - "id": "monthly_dataset", - "period_type": "monthly", - "sync": {"availability": {"lag_days": 1}}, - }, - requested_end="2026-05", - ) - - assert result == "2026-04" - - -def test_latest_available_end_uses_provider_availability_hook(monkeypatch: pytest.MonkeyPatch) -> None: - calls: list[dict[str, object]] = [] - - def fake_latest_available(*, dataset: dict[str, object], requested_end: str) -> str: - calls.append({"dataset": dataset, "requested_end": requested_end}) - return "2026-02-05" - - monkeypatch.setattr(sync_engine, "_get_dynamic_function", lambda _: fake_latest_available) - - source_dataset = { - "id": "provider_dataset", - "period_type": "daily", - "sync": {"availability": {"latest_available_function": "provider.latest_available"}}, - } - result = sync_engine._latest_available_end(source_dataset=source_dataset, requested_end="2026-02-10") - - assert result == "2026-02-05" - assert calls == [{"dataset": source_dataset, "requested_end": "2026-02-10"}] - - -def test_latest_available_end_clamps_provider_availability_to_requested_end(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(sync_engine, "_get_dynamic_function", lambda _: lambda: "2026-03-01") - - result = sync_engine._latest_available_end( - source_dataset={ - "id": "provider_dataset", - "period_type": "daily", 
- "sync": {"availability": {"latest_available_function": "provider.latest_available"}}, - }, - requested_end="2026-02-10", - ) - - assert result == "2026-02-10" - - -def test_latest_available_end_wraps_provider_import_errors(monkeypatch: pytest.MonkeyPatch) -> None: - def fail_import(_: str) -> object: - raise ImportError("missing provider") - - monkeypatch.setattr(sync_engine, "_get_dynamic_function", fail_import) - - with pytest.raises( - sync_engine.SyncConfigurationError, - match="Latest availability function 'provider.latest_available' failed", - ): - sync_engine._latest_available_end( - source_dataset={ - "id": "provider_dataset", - "period_type": "daily", - "sync": {"availability": {"latest_available_function": "provider.latest_available"}}, - }, - requested_end="2026-02-10", - ) - - -def test_latest_available_end_rejects_invalid_provider_period_string(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(sync_engine, "_get_dynamic_function", lambda _: lambda: "2026-31-99") - - with pytest.raises( - sync_engine.SyncConfigurationError, - match="Latest availability function 'provider.latest_available' returned invalid period", - ): - sync_engine._latest_available_end( - source_dataset={ - "id": "provider_dataset", - "period_type": "daily", - "sync": {"availability": {"latest_available_function": "provider.latest_available"}}, - }, - requested_end="2026-02-10", - ) - - -def test_latest_available_end_wraps_invalid_provider_function_path(monkeypatch: pytest.MonkeyPatch) -> None: - with pytest.raises(sync_engine.SyncConfigurationError, match="Latest availability function 'invalid_path' failed"): - sync_engine._latest_available_end( - source_dataset={ - "id": "provider_dataset", - "period_type": "daily", - "sync": {"availability": {"latest_available_function": "invalid_path"}}, - }, - requested_end="2026-02-10", - ) - - def test_latest_available_end_uses_plugin_periods_for_plugin_datasets( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -796,32 +595,6 @@ 
def test_latest_available_end_plugin_returns_current_end_when_no_new_periods( assert result == "2026-02-06" -def test_latest_available_end_falls_back_to_legacy_when_plugin_unavailable( - monkeypatch: pytest.MonkeyPatch, -) -> None: - """When _plugin_latest_available_period returns None (e.g. plugin needs extra_params), - the legacy latest_available_function path is used.""" - monkeypatch.setattr( - sync_engine, - "_plugin_latest_available_period", - lambda **_kw: None, - ) - monkeypatch.setattr(sync_engine, "_get_dynamic_function", lambda _: lambda: "2026-02-05") - - result = sync_engine._latest_available_end( - source_dataset={ - "id": "worldpop_population_yearly", - "period_type": "daily", - "sync": {"availability": {"latest_available_function": "provider.latest_available"}}, - "ingestion": {"plugin": "some.Plugin", "params": {}}, - }, - requested_end="2026-02-10", - current_end="2026-02-06", - ) - - assert result == "2026-02-05" - - def test_plugin_latest_available_period_returns_last_period() -> None: """_plugin_latest_available_period returns the last item from plugin.periods().""" @@ -969,34 +742,6 @@ def test_plan_sync_noop_when_plugin_reports_no_new_periods(monkeypatch: pytest.M assert result.action == "no_op" -def test_sync_plan_route_returns_500_for_provider_hook_misconfiguration( - client: TestClient, - monkeypatch: pytest.MonkeyPatch, -) -> None: - dataset_id = "chirps3_precipitation_daily_sle" - latest = _artifact(artifact_id="a1", managed_dataset_id=dataset_id, end="2026-01-31") - monkeypatch.setattr(services, "get_latest_artifact_for_dataset_or_404", lambda _: latest) - monkeypatch.setattr( - services.registry_datasets, - "get_dataset", - lambda _: { - "id": "chirps3_precipitation_daily", - "period_type": "daily", - "sync": {"kind": "temporal", "availability": {"latest_available_function": "provider.latest_available"}}, - }, - ) - - def fail_import(_: str) -> object: - raise ImportError("missing provider") - - monkeypatch.setattr(sync_engine, 
"_get_dynamic_function", fail_import) - - response = client.get(f"/sync/{dataset_id}/plan", params={"end": "2026-02-10"}) - - assert response.status_code == 500 - assert "Latest availability function 'provider.latest_available' failed" in response.json()["detail"] - - def test_run_sync_raises_clear_error_when_append_invariants_are_missing(monkeypatch: pytest.MonkeyPatch) -> None: latest_artifact = _artifact( artifact_id="a1", diff --git a/tests/test_provider_availability.py b/tests/test_provider_availability.py deleted file mode 100644 index 50a1fd06..00000000 --- a/tests/test_provider_availability.py +++ /dev/null @@ -1,62 +0,0 @@ -from datetime import UTC, date, datetime - -import pytest - -from climate_api.providers import availability - - -def test_lagged_latest_available_formats_hourly_lag(monkeypatch: pytest.MonkeyPatch) -> None: - class FixedDateTime(datetime): - @classmethod - def now(cls, tz: object = None) -> "FixedDateTime": # noqa: ANN401 - return cls(2026, 4, 21, 12, 34, tzinfo=UTC) - - monkeypatch.setattr(availability, "utc_now", lambda: FixedDateTime(2026, 4, 21, 12, 34, tzinfo=UTC)) - - result = availability.lagged_latest_available( - dataset={ - "period_type": "hourly", - "sync": {"availability": {"lag_hours": 5}}, - }, - requested_end="2026-04-21T12:00:00", - ) - - assert result == "2026-04-21T07" - - -def test_lagged_latest_available_formats_daily_lag(monkeypatch: pytest.MonkeyPatch) -> None: - class FixedDate(date): - @classmethod - def today(cls) -> "FixedDate": - return cls(2026, 4, 21) - - monkeypatch.setattr(availability, "utc_today", lambda: FixedDate(2026, 4, 21)) - - result = availability.lagged_latest_available( - dataset={ - "period_type": "daily", - "sync": {"availability": {"lag_days": 2}}, - }, - requested_end="2026-04-21", - ) - - assert result == "2026-04-19" - - -def test_lagged_latest_available_formats_yearly_offset(monkeypatch: pytest.MonkeyPatch) -> None: - class FixedDate(date): - @classmethod - def today(cls) -> "FixedDate": 
- return cls(2026, 4, 21) - - monkeypatch.setattr(availability, "utc_today", lambda: FixedDate(2026, 4, 21)) - - result = availability.lagged_latest_available( - dataset={ - "period_type": "yearly", - "sync": {"availability": {"latest_year_offset": 1}}, - }, - requested_end="2028", - ) - - assert result == "2025" From e3b6992396ae0ff83531c5c78b3b462ae8459714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 01:10:07 +0200 Subject: [PATCH 31/80] =?UTF-8?q?refactor:=20delete=20climate=5Fapi/provid?= =?UTF-8?q?ers=20package=20=E2=80=94=20no=20longer=20contains=20anything?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The providers/ package contained only availability.py (deleted in the prior commit) and an empty __init__.py. Remove the package entirely and update the one test fixture that referenced the old module path. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/providers/__init__.py | 1 - tests/test_dataset_registry.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) delete mode 100644 climate_api/providers/__init__.py diff --git a/climate_api/providers/__init__.py b/climate_api/providers/__init__.py deleted file mode 100644 index 179fb70c..00000000 --- a/climate_api/providers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Provider-specific Climate API helpers.""" diff --git a/tests/test_dataset_registry.py b/tests/test_dataset_registry.py index 2bd4bbf7..e4baf98e 100644 --- a/tests/test_dataset_registry.py +++ b/tests/test_dataset_registry.py @@ -178,7 +178,7 @@ def test_dataset_registry_accepts_sync_availability_function( sync: kind: temporal availability: - latest_available_function: climate_api.providers.availability.lagged_latest_available + latest_available_function: some.module.latest_available ingestion: plugin: some.ingest.Plugin """, @@ -187,5 +187,5 @@ def test_dataset_registry_accepts_sync_availability_function( monkeypatch.setattr(datasets, "CONFIGS_DIR", 
tmp_path) assert datasets.list_datasets()[0]["sync"]["availability"]["latest_available_function"].endswith( - "lagged_latest_available" + "latest_available" ) From 36984d1df5aace29148b76ba5937f584d68d1384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 01:18:26 +0200 Subject: [PATCH 32/80] =?UTF-8?q?fix:=20address=20fourth=20Copilot=20PR=20?= =?UTF-8?q?review=20=E2=80=94=20task=20leak,=20GeoZarr=20attrs,=20Protocol?= =?UTF-8?q?,=20unused=20param?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - orchestrator: wrap task loop in try/except BaseException to cancel remaining tasks on any exception (network error, decode error, CancelledError) - orchestrator: write proj:code and spatial:bbox root attrs on the first mode='w' write so /zarr listings return correct CRS and bounds for Icechunk stores - protocol: remove rechunk_time from Protocol member list — Python 3.13 isinstance checks non-callable protocol attrs, so plugins that omit rechunk_time would fail load_plugin's isinstance guard; services.py already reads it via getattr - services: remove unused publish param from _upsert_icechunk_artifact_record; publishing is handled by the caller after the upsert Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/orchestrator.py | 109 +++++++++++++++++------------ climate_api/ingest/protocol.py | 14 ++-- climate_api/ingestions/services.py | 4 +- tests/test_dataset_registry.py | 4 +- tests/test_datasets_sync.py | 2 +- 5 files changed, 76 insertions(+), 57 deletions(-) diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index 4365973f..118fd69d 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -46,6 +46,19 @@ def _strip_cf_encoding(ds: xr.Dataset, period_type: str) -> None: ds["time"].encoding.update({"units": units, "dtype": "int32"}) +def _write_geozarr_attrs(store: Any, *, spec: GridSpec, bbox: list[float]) -> 
None: + """Write GeoZarr root-level attributes to the store after the first mode='w' write.""" + import zarr + + root = zarr.open_group(store, mode="r+") + attrs: dict[str, Any] = { + "proj:code": f"EPSG:{spec.crs}", + "spatial:bbox": bbox, + } + attrs.update(spec.attrs) + root.attrs.update(attrs) + + def load_plugin( dotted_path: str, params: dict[str, Any], @@ -154,52 +167,58 @@ async def _fetch(period_id: str) -> xr.Dataset: # Await in chronological order so writes are always sequential. tasks = [asyncio.create_task(_fetch(p)) for p in pending] - for i, task in enumerate(tasks): - if is_cancel_requested and is_cancel_requested(): - for t in tasks[i:]: - t.cancel() - from climate_api.jobs.models import JobCancelledError - - raise JobCancelledError("Ingest cancelled between periods") - - ds = await task - period_id = pending[i] - if apply_transforms is not None: - ds = apply_transforms(ds) - _strip_cf_encoding(ds, period_type=period_type) - - # Each period uses its own writable session so that to_zarr(append_dim=) - # on the next period reads the committed store and finds the time axis. - # Icechunk 2.x sessions do not expose uncommitted writes to subsequent - # zarr.open_group calls, so batching writes within one session breaks the - # append — committing per period is the correct pattern. - session = repo.writable_session("main") - - if not spec.time_dim: - ds.to_zarr(session.store, mode="w") - elif i == 0 and is_first_write: - ds.to_zarr(session.store, mode="w") - else: - ds.to_zarr(session.store, append_dim="time") - - session.commit(f"ingest: {period_id}") - - # Save cursor at commit_batch_size intervals and at the end. - # commit_batch_size controls resume granularity (cursor save frequency), - # not commit frequency — every period is committed for correctness. 
- if save_cursor and ((i + 1) % plugin.commit_batch_size == 0 or (i + 1) == len(pending)): - save_cursor({"last_committed": period_id}) - logger.info("Cursor saved: up to %s (%d/%d)", period_id, i + 1, len(pending)) - - logger.debug("Committed: %s (%d/%d)", period_id, i + 1, len(pending)) - - if on_progress: - on_progress(done=done_offset + i + 1, total=len(all_periods), message=f"Wrote {period_id}") - - if not spec.time_dim: - for t in tasks[i + 1 :]: + try: + for i, task in enumerate(tasks): + if is_cancel_requested and is_cancel_requested(): + for t in tasks[i:]: + t.cancel() + from climate_api.jobs.models import JobCancelledError + + raise JobCancelledError("Ingest cancelled between periods") + + ds = await task + period_id = pending[i] + if apply_transforms is not None: + ds = apply_transforms(ds) + _strip_cf_encoding(ds, period_type=period_type) + + # Each period uses its own writable session so that to_zarr(append_dim=) + # on the next period reads the committed store and finds the time axis. + # Icechunk 2.x sessions do not expose uncommitted writes to subsequent + # zarr.open_group calls, so batching writes within one session breaks the + # append — committing per period is the correct pattern. + session = repo.writable_session("main") + + is_first_period_write = not spec.time_dim or (i == 0 and is_first_write) + if is_first_period_write: + ds.to_zarr(session.store, mode="w") + _write_geozarr_attrs(session.store, spec=spec, bbox=bbox) + else: + ds.to_zarr(session.store, append_dim="time") + + session.commit(f"ingest: {period_id}") + + # Save cursor at commit_batch_size intervals and at the end. + # commit_batch_size controls resume granularity (cursor save frequency), + # not commit frequency — every period is committed for correctness. 
+ if save_cursor and ((i + 1) % plugin.commit_batch_size == 0 or (i + 1) == len(pending)): + save_cursor({"last_committed": period_id}) + logger.info("Cursor saved: up to %s (%d/%d)", period_id, i + 1, len(pending)) + + logger.debug("Committed: %s (%d/%d)", period_id, i + 1, len(pending)) + + if on_progress: + on_progress(done=done_offset + i + 1, total=len(all_periods), message=f"Wrote {period_id}") + + if not spec.time_dim: + for t in tasks[i + 1 :]: + t.cancel() + break + except BaseException: + for t in tasks: + if not t.done(): t.cancel() - break + raise if rechunk_time is not None and spec.time_dim: logger.info("Rechunking %s after ingest: time chunk → %d", store_path, rechunk_time) diff --git a/climate_api/ingest/protocol.py b/climate_api/ingest/protocol.py index d5c6e16e..ac232585 100644 --- a/climate_api/ingest/protocol.py +++ b/climate_api/ingest/protocol.py @@ -55,16 +55,18 @@ class IngestionPlugin(Protocol): restart resumes from the last checkpoint rather than re-scanning the store. Use 1 for monthly sources, ~30 for daily, ~720 for hourly. - rechunk_time: optional target time chunk size for the post-ingest rechunk. - When set, the orchestrator rewrites the store after all periods are - committed so the time axis uses chunks of this size instead of the - per-period chunk-of-1. Omit (or set to None) to skip rechunking. - Typical values: 30 for daily, 720 for hourly. + rechunk_time (optional class attribute): target time chunk size for the + post-ingest rechunk. When set, the orchestrator rewrites the store after + all periods are committed so the time axis uses chunks of this size + instead of the per-period chunk-of-1. Declare as a class attribute + (``rechunk_time: int | None = None``) to skip rechunking, or set to a + positive int (30 for daily, 720 for hourly). This attribute is read via + ``getattr`` and is intentionally excluded from the Protocol so that + plugins that omit it still pass the ``isinstance`` check. 
""" max_concurrency: int commit_batch_size: int - rechunk_time: int | None async def probe(self, bbox: list[float], **params: Any) -> GridSpec: """Metadata-only source probe. Returns grid spec. No data transfer.""" diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index d6fce283..76709865 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -386,7 +386,7 @@ def _create_icechunk_artifact( created_at=datetime.now(UTC), publication=ArtifactPublication(), ) - stored = _upsert_icechunk_artifact_record(record, publish=publish) + stored = _upsert_icechunk_artifact_record(record) logger.info( "Stored Icechunk artifact '%s' for '%s': coverage=%s..%s", stored.artifact_id, @@ -769,7 +769,7 @@ def mutate(records: list[ArtifactRecord]) -> ArtifactRecord: return _mutate_records(mutate) -def _upsert_icechunk_artifact_record(record: ArtifactRecord, *, publish: bool) -> ArtifactRecord: +def _upsert_icechunk_artifact_record(record: ArtifactRecord) -> ArtifactRecord: """Persist an Icechunk artifact record, replacing any existing record for the same store path. 
Matches by dataset_id + path rather than request_scope so that sync appends diff --git a/tests/test_dataset_registry.py b/tests/test_dataset_registry.py index e4baf98e..58ffabd9 100644 --- a/tests/test_dataset_registry.py +++ b/tests/test_dataset_registry.py @@ -186,6 +186,4 @@ def test_dataset_registry_accepts_sync_availability_function( ) monkeypatch.setattr(datasets, "CONFIGS_DIR", tmp_path) - assert datasets.list_datasets()[0]["sync"]["availability"]["latest_available_function"].endswith( - "latest_available" - ) + assert datasets.list_datasets()[0]["sync"]["availability"]["latest_available_function"].endswith("latest_available") diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index 0db02e0c..2a3a3694 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -1025,7 +1025,7 @@ def fake_run_ingest_sync(**kwargs: object) -> None: monkeypatch.setattr(svc, "get_extent", lambda: None) monkeypatch.setattr(svc.downloader, "DOWNLOAD_DIR", tmp_path) monkeypatch.setattr(svc, "_store_artifact_record", lambda record, **_: record) - monkeypatch.setattr(svc, "_upsert_icechunk_artifact_record", lambda record, **_: record) + monkeypatch.setattr(svc, "_upsert_icechunk_artifact_record", lambda record: record) class _FakeRepo: From f56983710271419f0f791efc60f92fa7bfde2ba0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 01:33:47 +0200 Subject: [PATCH 33/80] feat: multiscale pyramid support for Icechunk ingest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add build_pyramid_store() to ingest/store.py: loads the committed flat store, calls topozarr.create_pyramid() with auto-computed level count (same 512-pixel tile target / 2048×2048 threshold as the legacy downloader), and writes back to Icechunk as a DataTree (0/, 1/, 2/, ...). 
topozarr writes proj:code, spatial:bbox, and zarr_conventions multiscales metadata to root — /zarr listings and STAC already use these attrs. Wire pyramid into the orchestrator as a post-ingest step (after rechunk, before expire_snapshots) via a plugin class attribute pyramid: bool. Services passes pyramid=getattr(plugin, 'pyramid', False); initial ingest only (sync appends skip pyramid rebuild like they skip rechunking). WorldPopPlugin sets pyramid = True — high-resolution global population grids need overviews for efficient browser-based map rendering. Supporting changes: - read_committed_period_ids: falls back to group="0" for pyramid stores - open_icechunk_dataset: opens group "0" when multiscales present in root attrs - stac/_is_icechunk_pyramid: detects pyramid via root zarr attrs - stac/_public_zarr_asset_href: routes Icechunk pyramids to /zarr/{id}/0 - IngestionPlugin docstring: documents pyramid optional class attribute Co-Authored-By: Claude Sonnet 4.6 --- .../data_accessor/services/accessor.py | 11 +++- climate_api/ingest/orchestrator.py | 11 +++- climate_api/ingest/plugins/worldpop.py | 1 + climate_api/ingest/protocol.py | 7 +++ climate_api/ingest/store.py | 50 +++++++++++++++++++ climate_api/ingestions/services.py | 5 +- climate_api/stac/services.py | 20 +++++++- 7 files changed, 99 insertions(+), 6 deletions(-) diff --git a/climate_api/data_accessor/services/accessor.py b/climate_api/data_accessor/services/accessor.py index 9d4faa92..4635c7d6 100644 --- a/climate_api/data_accessor/services/accessor.py +++ b/climate_api/data_accessor/services/accessor.py @@ -126,8 +126,13 @@ def open_zarr_dataset(zarr_path: str) -> xr.Dataset: def open_icechunk_dataset(store_path: str | Path) -> xr.Dataset: - """Open an Icechunk store as an xarray Dataset via a readonly MVCC session.""" + """Open an Icechunk store as an xarray Dataset via a readonly MVCC session. 
+ + Detects multiscale pyramid stores (root group has ``multiscales`` in attrs) + and opens group ``0`` (full resolution) in that case. + """ import icechunk + import zarr path = Path(store_path) if not path.exists(): @@ -135,7 +140,9 @@ def open_icechunk_dataset(store_path: str | Path) -> xr.Dataset: storage = icechunk.local_filesystem_storage(str(path)) repo = icechunk.Repository.open(storage) session = repo.readonly_session("main") - return xr.open_zarr(session.store) # type: ignore[no-any-return] + root = zarr.open_group(session.store, mode="r") + group: str | None = "0" if "multiscales" in root.attrs else None + return xr.open_zarr(session.store, group=group) # type: ignore[no-any-return] def _open_zarr(zarr_path: str) -> xr.Dataset: diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index 118fd69d..5e145dfa 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -24,7 +24,7 @@ import xarray as xr from climate_api.ingest.protocol import GridSpec, IngestionPlugin -from climate_api.ingest.store import open_or_create_repo, read_committed_period_ids, rechunk_store +from climate_api.ingest.store import build_pyramid_store, open_or_create_repo, read_committed_period_ids, rechunk_store logger = logging.getLogger(__name__) @@ -107,6 +107,7 @@ async def run_ingest( load_cursor: Callable[[], dict[str, Any] | None] | None = None, rechunk_time: int | None = None, apply_transforms: Callable[[xr.Dataset], xr.Dataset] | None = None, + pyramid: bool = False, ) -> None: """Probe the source then stream per-period data into an Icechunk store. @@ -223,7 +224,11 @@ async def _fetch(period_id: str) -> xr.Dataset: if rechunk_time is not None and spec.time_dim: logger.info("Rechunking %s after ingest: time chunk → %d", store_path, rechunk_time) rechunk_store(store_path, time_chunk=rechunk_time) - # Reopen repo so expire_snapshots sees the post-rechunk HEAD. 
+ repo = open_or_create_repo(store_path) + + if pyramid: + build_pyramid_store(store_path, x_dim=spec.x_dim, y_dim=spec.y_dim) + # Reopen repo so expire_snapshots sees the post-pyramid HEAD. repo = open_or_create_repo(store_path) # Prune intermediate ingest snapshots: each period commit created one @@ -254,6 +259,7 @@ def run_ingest_sync( load_cursor: Callable[[], dict[str, Any] | None] | None = None, rechunk_time: int | None = None, apply_transforms: Callable[[xr.Dataset], xr.Dataset] | None = None, + pyramid: bool = False, ) -> None: """Synchronous wrapper around run_ingest for use in threaded job workers.""" asyncio.run( @@ -271,5 +277,6 @@ def run_ingest_sync( load_cursor=load_cursor, rechunk_time=rechunk_time, apply_transforms=apply_transforms, + pyramid=pyramid, ) ) diff --git a/climate_api/ingest/plugins/worldpop.py b/climate_api/ingest/plugins/worldpop.py index 305c8ec1..8a28b1e1 100644 --- a/climate_api/ingest/plugins/worldpop.py +++ b/climate_api/ingest/plugins/worldpop.py @@ -46,6 +46,7 @@ class WorldPopPlugin: max_concurrency = 1 commit_batch_size = 1 rechunk_time: int | None = None + pyramid: bool = True def __init__(self, country_code: str = "", version: str = "global2") -> None: self.country_code = country_code.upper() diff --git a/climate_api/ingest/protocol.py b/climate_api/ingest/protocol.py index ac232585..15bdeba1 100644 --- a/climate_api/ingest/protocol.py +++ b/climate_api/ingest/protocol.py @@ -63,6 +63,13 @@ class IngestionPlugin(Protocol): positive int (30 for daily, 720 for hourly). This attribute is read via ``getattr`` and is intentionally excluded from the Protocol so that plugins that omit it still pass the ``isinstance`` check. + + pyramid (optional class attribute): when ``True``, the orchestrator builds + a multiscale pyramid after ingest completes. Level count is derived + automatically from the spatial dimensions (same 512-pixel tile target + and 2048×2048 threshold as the legacy downloader). 
Set on plugins whose + data resolution produces tiles too large for efficient browser rendering + without overviews. Like ``rechunk_time``, read via ``getattr``. """ max_concurrency: int diff --git a/climate_api/ingest/store.py b/climate_api/ingest/store.py index b3a38540..efbe4a8b 100644 --- a/climate_api/ingest/store.py +++ b/climate_api/ingest/store.py @@ -3,12 +3,17 @@ from __future__ import annotations import logging +import math from pathlib import Path from typing import TYPE_CHECKING if TYPE_CHECKING: import icechunk +_PYRAMID_PIXEL_THRESHOLD = 2048 * 2048 +_PYRAMID_TARGET_TILE_SIZE = 512 +_PYRAMID_MAX_LEVELS = 8 + logger = logging.getLogger(__name__) @@ -72,6 +77,45 @@ def rechunk_store(store_path: Path, *, time_chunk: int) -> None: ds.close() +def build_pyramid_store(store_path: Path, *, x_dim: str = "x", y_dim: str = "y") -> None: + """Rewrite the committed Icechunk store as a multiscale pyramid. + + Level count is derived from the actual spatial dimensions using the same + 512-pixel tile target as the legacy downloader. A no-op when the store does + not exist or its spatial extent is below the 2048×2048 threshold. + + The pyramid commit replaces the flat root structure: data moves from root + to ``0/`` and coarsened overviews are written to ``1/``, ``2/``, etc. + Intermediate ingest snapshots are left for the orchestrator's + expire_snapshots call to prune. 
+ """ + import xarray as xr + from topozarr import create_pyramid + + if not store_path.exists(): + return + + repo = open_or_create_repo(store_path) + read_session = repo.readonly_session("main") + ds = xr.open_zarr(read_session.store) + try: + nx = ds.sizes.get(x_dim, 0) + ny = ds.sizes.get(y_dim, 0) + if nx * ny <= _PYRAMID_PIXEL_THRESHOLD: + logger.info("Skipping pyramid for %s: %dx%d below threshold", store_path, nx, ny) + return + levels = min(math.ceil(math.log2(max(nx, ny) / _PYRAMID_TARGET_TILE_SIZE)), _PYRAMID_MAX_LEVELS) + ds_loaded = ds.load() + finally: + ds.close() + + pyramid = create_pyramid(ds_loaded, levels=levels, x_dim=x_dim, y_dim=y_dim) + write_session = repo.writable_session("main") + pyramid.dt.to_zarr(write_session.store, mode="w", encoding=pyramid.encoding, zarr_format=3) + write_session.commit(f"pyramid: {levels} levels") + logger.info("Built %d-level pyramid for %s (%dx%d)", levels, store_path, nx, ny) + + def read_committed_period_ids(store_path: Path, period_type: str) -> set[str]: """Return the set of period IDs already committed to the Icechunk store. @@ -91,6 +135,12 @@ def read_committed_period_ids(store_path: Path, period_type: str) -> set[str]: session = repo.readonly_session("main") ds = xr.open_zarr(session.store) try: + if "time" not in ds.coords: + # Pyramid store: time lives in level "0", not at root. + if "multiscales" not in ds.attrs: + return set() + ds.close() + ds = xr.open_zarr(session.store, group="0") if "time" not in ds.coords: return set() import pandas as pd diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 76709865..73f4f4a3 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -308,14 +308,16 @@ def _create_icechunk_artifact( # plugin's declared rechunk_time, if any. Sync appends skip rechunking to avoid # rewriting the full store on every small update. 
rechunk_time: int | None = getattr(plugin, "rechunk_time", None) if ingest_start is None else None + pyramid: bool = bool(getattr(plugin, "pyramid", False)) if ingest_start is None else False logger.info( - "Running Icechunk ingest for '%s': ingest_scope=%s..%s artifact_scope=%s..%s rechunk_time=%s", + "Running Icechunk ingest for '%s': ingest_scope=%s..%s artifact_scope=%s..%s rechunk_time=%s pyramid=%s", dataset_id, effective_start, end, start, end, rechunk_time, + pyramid, ) transforms = dataset.get("transforms") apply_transforms = (lambda ds: downloader._run_transforms(ds, dataset)) if transforms else None @@ -329,6 +331,7 @@ def _create_icechunk_artifact( period_type=period_type, rechunk_time=rechunk_time, apply_transforms=apply_transforms, + pyramid=pyramid, ) if not store_path.exists(): diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 03177786..ed6cf329 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -313,7 +313,10 @@ def _public_zarr_asset_href( source_dataset: dict[str, Any], ) -> str: artifact_path = _artifact_store_path(artifact) - if _is_pyramid_zarr(artifact_path): + if artifact.format == ArtifactFormat.ICECHUNK: + if _is_icechunk_pyramid(artifact_path): + return _abs_url(request, f"/zarr/{dataset_id}/0") + elif _is_pyramid_zarr(artifact_path): return _abs_url(request, f"/zarr/{dataset_id}/0") return _abs_url(request, f"/zarr/{dataset_id}") @@ -325,6 +328,21 @@ def _is_pyramid_zarr(artifact_path: str) -> bool: return (Path(artifact_path) / "0").is_dir() +def _is_icechunk_pyramid(store_path: str) -> bool: + """Return True if the Icechunk store contains a multiscale pyramid.""" + try: + import zarr + + from climate_api.ingest.store import open_or_create_repo + + repo = open_or_create_repo(Path(store_path)) + session = repo.readonly_session("main") + root = zarr.open_group(session.store, mode="r") + return "multiscales" in root.attrs + except Exception: + return False + + def _abs_url(request: 
Request, path: str) -> str: base_url = os.getenv("CLIMATE_API_BASE_URL") if base_url: From 122f03884f85d6a8f65a1d77fc36ff5819d10c06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 01:35:30 +0200 Subject: [PATCH 34/80] =?UTF-8?q?chore:=20remove=20dhis2eo=20dependency=20?= =?UTF-8?q?=E2=80=94=20never=20imported?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dhis2eo was listed as a runtime dependency but is not imported anywhere in the codebase. Only reference is a comment in an integration test explaining it is incompatible with the pinned xarray stack. Co-Authored-By: Claude Sonnet 4.6 --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5ad36fe3..0b7fc467 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,6 @@ dependencies = [ "topozarr==0.0.*", "rioxarray>=0.17", "portalocker>=3.2.0", - "dhis2eo>=1.2.1", "icechunk>=2.0,<3", ] From 37f8e743959730fc18cf28c4d7227a71d95cdca5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 01:43:47 +0200 Subject: [PATCH 35/80] refactor: remove legacy build_dataset_zarr and /build_zarr HTTP route All built-in datasets (ERA5-Land, CHIRPS3, WorldPop) now ingest via the IngestionPlugin/Icechunk path; the NetCDF-cache-to-zarr conversion function and its unregistered FastAPI route are no longer reachable. Retains DOWNLOAD_DIR, get_cache_files, get_zarr_path, _run_transforms and _get_dynamic_function which are still used by the accessor and ingestion services for legacy zarr fallback and YAML transform dispatch. 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/data_manager/__init__.py | 1 - climate_api/data_manager/routes.py | 24 -- .../data_manager/services/downloader.py | 197 +------------ tests/test_downloader.py | 269 +----------------- 4 files changed, 4 insertions(+), 487 deletions(-) delete mode 100644 climate_api/data_manager/routes.py diff --git a/climate_api/data_manager/__init__.py b/climate_api/data_manager/__init__.py index 186a758f..1331b20f 100644 --- a/climate_api/data_manager/__init__.py +++ b/climate_api/data_manager/__init__.py @@ -1,4 +1,3 @@ """Data manager package.""" -from . import routes as routes from . import services as services diff --git a/climate_api/data_manager/routes.py b/climate_api/data_manager/routes.py deleted file mode 100644 index e69bdeef..00000000 --- a/climate_api/data_manager/routes.py +++ /dev/null @@ -1,24 +0,0 @@ -"""FastAPI router exposing dataset endpoints.""" - -from fastapi import APIRouter, BackgroundTasks - -from ..data_registry.routes import _get_dataset_or_404 -from .services import downloader - -router = APIRouter() - - -@router.get( - "/{dataset_id}/build_zarr", - response_model=dict, - summary="Internal dataset Zarr build", - include_in_schema=False, -) -def build_dataset_zarr( - dataset_id: str, - background_tasks: BackgroundTasks, -) -> dict[str, str]: - """Internal low-level cache optimization route kept for compatibility.""" - dataset = _get_dataset_or_404(dataset_id) - background_tasks.add_task(downloader.build_dataset_zarr, dataset) - return {"status": "Building zarr file from dataset downloads"} diff --git a/climate_api/data_manager/services/downloader.py b/climate_api/data_manager/services/downloader.py index b525e288..3d7ed233 100644 --- a/climate_api/data_manager/services/downloader.py +++ b/climate_api/data_manager/services/downloader.py @@ -1,23 +1,15 @@ -"""Dataset cache: build and optimize raster data as local Zarr stores.""" +"""Dataset cache: utilities for locating and reading downloaded raster 
files.""" import importlib import logging import os -import shutil from collections.abc import Callable from pathlib import Path from typing import Any import xarray as xr -import xproj # noqa: F401 # type: ignore[import-untyped] # pyright: ignore[reportUnusedImport] -from geozarr_toolkit import MultiscalesConventionMetadata, create_geozarr_attrs -from topozarr.coarsen import create_pyramid from climate_api import config as api_config -from climate_api.shared.time import resolve_iso_period_step, time_chunk_for_iso_step -from climate_api.transforms.reproject import reproject_to_instance_crs - -from .utils import get_time_dim, get_x_y_dims logger = logging.getLogger(__name__) @@ -33,158 +25,6 @@ def _resolve_download_dir() -> Path: DOWNLOAD_DIR = _resolve_download_dir() -def build_dataset_zarr(dataset: dict[str, Any], *, start: str | None = None, end: str | None = None) -> None: - """Collect dataset cache files into one optimised Zarr archive, clipped to request scope.""" - logger.info(f"Optimizing cache for dataset {dataset['id']}") - - files = get_cache_files(dataset) - logger.info(f"Opening {len(files)} files from cache") - ds = xr.open_mfdataset(files, parallel=False) - - x_dim, y_dim = get_x_y_dims(ds) - dims = [x_dim, y_dim] - - # trim to only minimal vars and coords before loading into memory - logger.info("Trimming unnecessary variables and coordinates") - varname = dataset["variable"] - ds = ds[[varname]] - keep_coords = [get_time_dim(ds)] + dims - drop_coords = [c for c in ds.coords if c not in keep_coords] - ds = ds.drop_vars(drop_coords) - - # Normalise to canonical names so all stored Zarr files are consistent. 
- crs = api_config.get_crs() - time_dim = get_time_dim(ds) - rename_map = {k: v for k, v in [(time_dim, "time"), (x_dim, "x"), (y_dim, "y")] if k != v} - if rename_map: - ds = ds.rename(rename_map) - x_dim, y_dim = "x", "y" - dims = [x_dim, y_dim] - - ds = _select_time_range(ds, dataset=dataset, start=start, end=end) - ds = _run_transforms(ds, dataset) - - source_crs: str = dataset.get("source_crs", "EPSG:4326") - ds = reproject_to_instance_crs(ds, dataset, source_crs=source_crs) - - xmin = ds[x_dim].min().item() - xmax = ds[x_dim].max().item() - ymin = ds[y_dim].min().item() - ymax = ds[y_dim].max().item() - bbox = [xmin, ymin, xmax, ymax] - shape = (ds.sizes[x_dim], ds.sizes[y_dim]) - - # https://github.com/zarr-developers/geozarr-toolkit/issues/15 - geozarr_attrs = create_geozarr_attrs( - dimensions=dims, - crs=crs, - bbox=bbox, - shape=shape, - ) - - # save as zarr - logger.info("Saving to optimized zarr file") - zarr_path = DOWNLOAD_DIR / f"{_get_cache_prefix(dataset)}.zarr" - - if _needs_pyramid(ds, x_dim, y_dim): - levels = _pyramid_levels(ds, x_dim, y_dim) - logger.info("Building %d-level pyramid (max dim %d pixels)", levels, max(ds.sizes[x_dim], ds.sizes[y_dim])) - - # Add multiscales convention metadata to the zarr attributes - zarr_conventions = geozarr_attrs.get("zarr_conventions", []) - zarr_conventions.append(MultiscalesConventionMetadata().model_dump()) - geozarr_attrs["zarr_conventions"] = zarr_conventions - - # Load into memory then close to deterministically release netCDF file handles - # before create_pyramid spawns multiprocessing workers. After load() the data - # lives in numpy arrays and no longer needs the underlying file objects. 
- ds.load() - ds.close() - - ds = ds.proj.assign_crs(spatial_ref=crs) - - # https://github.com/carbonplan/topozarr/issues/13 - pyramid = create_pyramid(ds, levels=levels, x_dim=x_dim, y_dim=y_dim, method="mean") - - pyramid.dt.attrs.update(geozarr_attrs) - pyramid.dt.to_zarr(zarr_path, mode="w", encoding=pyramid.encoding, zarr_format=3) - - # zarr-layer looks for the time coordinate at the root of the store, not inside each level. - # Copy it from level 0 so browser clients can discover it without knowing the level structure. - time_dim = get_time_dim(ds) - time_src = zarr_path / "0" / time_dim - time_dst = zarr_path / time_dim - if time_src.exists(): - if time_dst.exists(): - shutil.rmtree(time_dst) - shutil.copytree(time_src, time_dst) - - pyramid.dt.close() - - else: - logger.info("Building flat zarr (max dim %d pixels)", max(ds.sizes[x_dim], ds.sizes[y_dim])) - uniform_chunks = _compute_time_space_chunks(ds, dataset) - logger.info(f"--> {uniform_chunks}") - - ds.attrs.update(geozarr_attrs) - ds_chunked = ds.chunk(uniform_chunks) - # Remove _FillValue from each variable's encoding so that in-memory NaN values - # are stored as IEEE NaN in zarr rather than re-encoded as a sentinel (e.g. - # -999.99). ZarrLayer uses the zarr fill_value attribute (nan for floats) to - # render missing pixels as transparent — not a separately specified fillValue. 
- for var in ds_chunked.data_vars: - ds_chunked[var].encoding.pop("_FillValue", None) - ds_chunked.to_zarr(zarr_path, mode="w", zarr_format=3, consolidated=True) - ds_chunked.close() - - ds.close() - logger.info("Finished cache optimization") - - -_PYRAMID_PIXEL_THRESHOLD = 2048 * 2048 -_PYRAMID_MAX_LEVELS = 8 -_PYRAMID_TARGET_TILE_SIZE = 512 - - -def _needs_pyramid(ds: xr.Dataset, x_dim: str, y_dim: str) -> bool: - """Return True when the spatial extent is large enough to benefit from a pyramid.""" - return ds.sizes[x_dim] * ds.sizes[y_dim] > _PYRAMID_PIXEL_THRESHOLD - - -def _pyramid_levels(ds: xr.Dataset, x_dim: str, y_dim: str) -> int: - """Compute the number of pyramid levels needed to reach a manageable tile size.""" - import math - - max_dim = max(ds.sizes[x_dim], ds.sizes[y_dim]) - levels = math.ceil(math.log2(max_dim / _PYRAMID_TARGET_TILE_SIZE)) - return max(2, min(levels, _PYRAMID_MAX_LEVELS)) - - -def _select_time_range( - ds: xr.Dataset, - *, - dataset: dict[str, Any], - start: str | None, - end: str | None, -) -> xr.Dataset: - """Clip a cached dataset to the managed artifact's requested temporal scope.""" - if start is None and end is None: - return ds - - time_dim = get_time_dim(ds) - selected = ds.sel({time_dim: slice(start, end)}) - if selected.sizes.get(time_dim, 0) == 0: - raise ValueError(f"No cached data for dataset '{dataset['id']}' intersects requested time range {start}..{end}") - logger.info( - "Clipped dataset '%s' to requested time range %s..%s (%d steps)", - dataset["id"], - start, - end, - selected.sizes[time_dim], - ) - return selected - - def _run_transforms(ds: xr.Dataset, dataset: dict[str, Any]) -> xr.Dataset: dataset_id = dataset.get("id", "?") for entry in dataset.get("transforms", []): @@ -208,41 +48,6 @@ def _run_transforms(ds: xr.Dataset, dataset: dict[str, Any]) -> xr.Dataset: return ds -def _compute_time_space_chunks( - ds: xr.Dataset, - dataset: dict[str, Any], - max_spatial_chunk: int = 512, -) -> dict[str, int]: - 
"""Compute chunk sizes tuned for common temporal access patterns.""" - chunks: dict[str, int] = {} - - iso_step = resolve_iso_period_step(dataset) - dim = get_time_dim(ds) - if iso_step is not None: - try: - chunks[dim] = time_chunk_for_iso_step(iso_step) - except ValueError: - logger.warning( - "Invalid ISO 8601 step %r for dataset '%s'; defaulting time chunk to 12.", - iso_step, - dataset.get("id", "?"), - ) - chunks[dim] = 12 - else: - logger.warning( - "No ISO 8601 step for dataset '%s'; defaulting time chunk to 12. " - "Declare 'extents.temporal.resolution' in the template to silence this warning.", - dataset.get("id", "?"), - ) - chunks[dim] = 12 - - x_dim, y_dim = get_x_y_dims(ds) - chunks[x_dim] = min(ds.sizes[x_dim], max_spatial_chunk) - chunks[y_dim] = min(ds.sizes[y_dim], max_spatial_chunk) - - return chunks - - def _get_cache_prefix(dataset: dict[str, Any]) -> str: return str(dataset["id"]) diff --git a/tests/test_downloader.py b/tests/test_downloader.py index d95e9611..5b2865c6 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -7,8 +7,6 @@ import pytest import xarray as xr import zarr -from topozarr.pyramid import Pyramid -from xarray import DataTree from climate_api.data_accessor.services.accessor import _coverage_from_dataset, open_zarr_dataset from climate_api.data_manager.services import downloader @@ -71,52 +69,6 @@ def _make_dataset() -> xr.Dataset: ) -def _write_nc_files(tmp_path: Path) -> list[Path]: - paths = [] - for year in (2020, 2021): - ds = xr.Dataset( - {"pop_total": (["time", "lat", "lon"], np.ones((1, 3, 3), dtype="float32"))}, - coords={ - "time": [pd.Timestamp(f"{year}-01-01")], - "lat": [10.0, 9.0, 8.0], - "lon": [30.0, 31.0, 32.0], - }, - ) - path = tmp_path / f"my_dataset_{year}.nc" - ds.to_netcdf(path) - paths.append(path) - return paths - - -def _write_daily_nc_file(tmp_path: Path) -> list[Path]: - ds = xr.Dataset( - {"precip": (["time", "lat", "lon"], np.ones((29, 3, 3), dtype="float32"))}, - coords={ - 
"time": pd.date_range("2024-02-01", "2024-02-29", freq="D"), - "lat": [10.0, 9.0, 8.0], - "lon": [30.0, 31.0, 32.0], - }, - ) - path = tmp_path / "chirps3_precipitation_daily_2024-02.nc" - ds.to_netcdf(path) - return [path] - - -_FLAT_DATASET: dict[str, Any] = { - "id": "my_dataset", - "variable": "pop_total", - "period_type": "yearly", - "ingestion": {}, -} - -_PYRAMID_DATASET: dict[str, Any] = { - "id": "my_dataset", - "variable": "pop_total", - "period_type": "yearly", - "ingestion": {}, -} - - # --------------------------------------------------------------------------- # open_zarr_dataset # --------------------------------------------------------------------------- @@ -151,18 +103,17 @@ def test_open_zarr_dataset_pyramid_falls_back_to_level_0(tmp_path: Path) -> None def test_open_zarr_dataset_pyramid_with_root_time_still_opens_level_0(tmp_path: Path) -> None: - """Root-level time coord (copied for zarr-layer) does not confuse the fallback. + """Root-level time coord does not confuse the fallback. The fallback triggers on empty data_vars, not empty dims, so a root group that only has a 'time' coordinate array still falls back to /0. 
""" + import shutil + ds = _make_dataset() zarr_path = tmp_path / "pyramid.zarr" zarr.open_group(str(zarr_path), mode="w", zarr_format=3) ds.to_zarr(str(zarr_path / "0"), mode="w", zarr_format=3) - # Simulate what build_dataset_zarr does: copy time to root - import shutil - shutil.copytree(str(zarr_path / "0" / "time"), str(zarr_path / "time")) result = open_zarr_dataset(str(zarr_path)) @@ -172,220 +123,6 @@ def test_open_zarr_dataset_pyramid_with_root_time_still_opens_level_0(tmp_path: result.close() -# --------------------------------------------------------------------------- -# build_dataset_zarr — flat path -# --------------------------------------------------------------------------- - - -def test_build_dataset_zarr_flat_creates_zarr(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """Flat zarr is written with the correct variable and no pyramid level dirs.""" - nc_files = _write_nc_files(tmp_path) - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: nc_files) - - downloader.build_dataset_zarr(_FLAT_DATASET) - - zarr_path = tmp_path / "my_dataset.zarr" - assert zarr_path.exists() - assert not (zarr_path / "0").exists() - - result = open_zarr_dataset(str(zarr_path)) - try: - assert "pop_total" in result.data_vars - assert result.sizes["time"] == 2 - finally: - result.close() - - -def test_build_dataset_zarr_normalises_coordinate_names(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """Source coordinates (lat/lon, valid_time) are renamed to x/y/time.""" - # Simulate ERA5-Land source with valid_time and lon/lat - ds_era5 = xr.Dataset( - {"t2m": (["valid_time", "lat", "lon"], np.ones((2, 3, 3), dtype="float32"))}, - coords={ - "valid_time": pd.date_range("2024-01-01", periods=2, freq="h"), - "lat": [10.0, 9.0, 8.0], - "lon": [30.0, 31.0, 32.0], - }, - ) - path = tmp_path / "era5_t2m_2024-01.nc" - ds_era5.to_netcdf(path) - - dataset: dict[str, Any] = { - "id": 
"era5land_temperature_hourly", - "variable": "t2m", - "period_type": "hourly", - "ingestion": {}, - } - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: [path]) - - downloader.build_dataset_zarr(dataset) - - result = open_zarr_dataset(str(tmp_path / "era5land_temperature_hourly.zarr")) - try: - assert "time" in result.coords - assert "x" in result.coords - assert "y" in result.coords - assert "valid_time" not in result.coords - assert "lat" not in result.coords - assert "lon" not in result.coords - finally: - result.close() - - -def test_build_dataset_zarr_normalises_xy_coordinate_names(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """Source coordinates already named x/y are preserved as x/y.""" - ds_xy = xr.Dataset( - {"precip": (["time", "y", "x"], np.ones((2, 3, 3), dtype="float32"))}, - coords={ - "time": pd.date_range("2024-01-01", periods=2, freq="D"), - "y": [10.0, 9.0, 8.0], - "x": [30.0, 31.0, 32.0], - }, - ) - path = tmp_path / "chirps_xy_2024-01.nc" - ds_xy.to_netcdf(path) - - dataset: dict[str, Any] = { - "id": "chirps3_precipitation_daily", - "variable": "precip", - "period_type": "daily", - "ingestion": {}, - } - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: [path]) - - downloader.build_dataset_zarr(dataset) - - result = open_zarr_dataset(str(tmp_path / "chirps3_precipitation_daily.zarr")) - try: - assert "time" in result.coords - assert "x" in result.coords - assert "y" in result.coords - finally: - result.close() - - -def test_build_dataset_zarr_clips_to_requested_daily_range( - tmp_path: Path, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Provider cache files may contain full months; canonical Zarr honors request scope.""" - nc_files = _write_daily_nc_file(tmp_path) - dataset: dict[str, Any] = { - "id": "chirps3_precipitation_daily", - "variable": "precip", - "period_type": "daily", - 
"ingestion": {}, - } - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: nc_files) - - downloader.build_dataset_zarr(dataset, start="2024-02-01", end="2024-02-10") - - result = open_zarr_dataset(str(tmp_path / "chirps3_precipitation_daily.zarr")) - try: - assert result.sizes["time"] == 10 - assert pd.Timestamp(result.time.min().item()).strftime("%Y-%m-%d") == "2024-02-01" - assert pd.Timestamp(result.time.max().item()).strftime("%Y-%m-%d") == "2024-02-10" - finally: - result.close() - - -# --------------------------------------------------------------------------- -# build_dataset_zarr — pyramid path -# --------------------------------------------------------------------------- - - -def _make_fake_pyramid(ds: xr.Dataset, zarr_path: Path) -> Pyramid: - """Return a Pyramid whose .dt.to_zarr writes a minimal two-level DataTree store.""" - level0 = ds - level1 = ds.coarsen(y=2, x=2, boundary="trim").mean() # pyright: ignore[reportAttributeAccessIssue] - dt = DataTree.from_dict({"0": level0, "1": level1}) - return Pyramid(datatree=dt, encoding={}) - - -def test_build_dataset_zarr_pyramid_copies_time_to_root(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """Pyramid zarr build copies the time coordinate to the store root for zarr-layer.""" - nc_files = _write_nc_files(tmp_path) - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: nc_files) - monkeypatch.setattr(downloader, "_needs_pyramid", lambda *_: True) - - def fake_create_pyramid(ds: xr.Dataset, levels: int, x_dim: str, y_dim: str, method: str) -> Pyramid: - return _make_fake_pyramid(ds, tmp_path / "my_dataset.zarr") - - monkeypatch.setattr(downloader, "create_pyramid", fake_create_pyramid) - - downloader.build_dataset_zarr(_PYRAMID_DATASET) - - zarr_path = tmp_path / "my_dataset.zarr" - assert (zarr_path / "0").exists(), "pyramid level 0 should exist" - assert 
(zarr_path / "time").exists(), "time coordinate must be copied to zarr root" - - -def test_build_dataset_zarr_pyramid_is_openable_via_level_0(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """open_zarr_dataset returns the dataset from level 0 of the pyramid store.""" - nc_files = _write_nc_files(tmp_path) - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: nc_files) - monkeypatch.setattr(downloader, "_needs_pyramid", lambda *_: True) - - def fake_create_pyramid(ds: xr.Dataset, levels: int, x_dim: str, y_dim: str, method: str) -> Pyramid: - return _make_fake_pyramid(ds, tmp_path / "my_dataset.zarr") - - monkeypatch.setattr(downloader, "create_pyramid", fake_create_pyramid) - - downloader.build_dataset_zarr(_PYRAMID_DATASET) - - result = open_zarr_dataset(str(tmp_path / "my_dataset.zarr")) - try: - assert "pop_total" in result.data_vars - assert result.sizes["time"] == 2 - finally: - result.close() - - -def test_build_dataset_zarr_pyramid_normalises_coordinate_names( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: - """Pyramid zarr store uses canonical x/y/time coordinate names.""" - # Source files use lat/lon (WorldPop-style); canonical names must appear in the written store. - nc_files = _write_nc_files(tmp_path) - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: nc_files) - monkeypatch.setattr(downloader, "_needs_pyramid", lambda *_: True) - - received: list[xr.Dataset] = [] - - def fake_create_pyramid(ds: xr.Dataset, levels: int, x_dim: str, y_dim: str, method: str) -> Pyramid: - received.append(ds) - return _make_fake_pyramid(ds, tmp_path / "my_dataset.zarr") - - monkeypatch.setattr(downloader, "create_pyramid", fake_create_pyramid) - - downloader.build_dataset_zarr(_PYRAMID_DATASET) - - # The dataset handed to create_pyramid must already carry canonical names. 
- assert len(received) == 1 - ds_in = received[0] - assert "x" in ds_in.coords - assert "y" in ds_in.coords - assert "time" in ds_in.coords - assert "lon" not in ds_in.coords - assert "lat" not in ds_in.coords - - # The written store must also expose canonical names when opened. - result = open_zarr_dataset(str(tmp_path / "my_dataset.zarr")) - try: - assert "x" in result.coords - assert "y" in result.coords - assert "time" in result.coords - finally: - result.close() - - # --------------------------------------------------------------------------- # _coverage_from_dataset — WGS84 reprojection # --------------------------------------------------------------------------- From 52ba79e24ebe34afc6ac6c0187b6074b0e5f72ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 01:51:25 +0200 Subject: [PATCH 36/80] refactor: remove legacy netcdf/flat-zarr code paths after Icechunk migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - get_data() now opens the Icechunk store directly via open_icechunk_dataset() instead of falling back through flat zarr → netcdf - get_data_coverage_for_paths() drops the netcdf_paths parameter; zarr_path is now required (used only by the resample path which writes flat zarr) - get_cache_files() and get_zarr_path() removed; replaced by get_icechunk_path() which consolidates the DOWNLOAD_DIR / "{id}.icechunk" convention in one place - _validate_sync_availability() removed — validated a YAML field that referenced the deleted climate_api.providers.availability module - reproject.py docstring updated: constraint on x/y names is now guaranteed by the ingest orchestrator, not the removed build_dataset_zarr function Co-Authored-By: Claude Sonnet 4.6 --- .../data_accessor/services/accessor.py | 40 +++------------ .../data_manager/services/downloader.py | 17 ++----- .../data_registry/services/datasets.py | 18 ------- climate_api/ingestions/services.py | 2 +- 
climate_api/transforms/reproject.py | 2 +- tests/test_dataset_registry.py | 49 ------------------- 6 files changed, 11 insertions(+), 117 deletions(-) diff --git a/climate_api/data_accessor/services/accessor.py b/climate_api/data_accessor/services/accessor.py index 4635c7d6..f1307398 100644 --- a/climate_api/data_accessor/services/accessor.py +++ b/climate_api/data_accessor/services/accessor.py @@ -10,7 +10,7 @@ import xarray as xr from pyproj import Transformer -from ...data_manager.services.downloader import get_cache_files, get_zarr_path +from ...data_manager.services.downloader import get_icechunk_path from ...data_manager.services.utils import get_time_dim, get_x_y_dims from ...shared.time import numpy_datetime_to_period_string @@ -25,21 +25,8 @@ def get_data( ) -> xr.Dataset: """Load an xarray raster dataset for a given time range and bbox.""" logger.info("Opening dataset") - zarr_path = get_zarr_path(dataset) - if zarr_path: - logger.info(f"Using optimized zarr file: {zarr_path}") - ds = open_zarr_dataset(str(zarr_path)) - else: - logger.warning( - f"Could not find optimized zarr file for dataset {dataset['id']}, using slower netcdf files instead." 
- ) - files = get_cache_files(dataset) - ds = xr.open_mfdataset( - files, - data_vars="minimal", - coords="minimal", # pyright: ignore[reportArgumentType] - compat="override", - ) + store_path = get_icechunk_path(dataset) + ds = open_icechunk_dataset(store_path) if start and end: logger.info(f"Subsetting time to {start} and {end}") @@ -73,25 +60,10 @@ def get_data_coverage(dataset: dict[str, Any]) -> dict[str, Any]: def get_data_coverage_for_paths( dataset: dict[str, Any], *, - zarr_path: str | None = None, - netcdf_paths: list[str] | None = None, + zarr_path: str, ) -> dict[str, Any]: - """Return coverage metadata for the concrete files created for one artifact.""" - if zarr_path is not None and netcdf_paths: - raise ValueError("Provide either zarr_path or netcdf_paths when computing coverage, not both") - if zarr_path is None and not netcdf_paths: - raise ValueError("Coverage calculation requires either zarr_path or at least one netcdf path") - - if zarr_path is not None: - ds = open_zarr_dataset(zarr_path) - else: - assert netcdf_paths is not None - ds = xr.open_mfdataset( - netcdf_paths, - data_vars="minimal", - coords="minimal", # pyright: ignore[reportArgumentType] - compat="override", - ) + """Return coverage metadata for a materialized flat-zarr artifact.""" + ds = open_zarr_dataset(zarr_path) from climate_api import config as api_config diff --git a/climate_api/data_manager/services/downloader.py b/climate_api/data_manager/services/downloader.py index 3d7ed233..2dacfbea 100644 --- a/climate_api/data_manager/services/downloader.py +++ b/climate_api/data_manager/services/downloader.py @@ -52,20 +52,9 @@ def _get_cache_prefix(dataset: dict[str, Any]) -> str: return str(dataset["id"]) -def get_cache_files(dataset: dict[str, Any]) -> list[Path]: - """Return all NetCDF cache files matching this dataset's prefix.""" - # TODO: not bulletproof -- e.g. 
2m_temperature matches 2m_temperature_modified - prefix = _get_cache_prefix(dataset) - return list(DOWNLOAD_DIR.glob(f"{prefix}*.nc")) - - -def get_zarr_path(dataset: dict[str, Any]) -> Path | None: - """Return the optimised zarr archive path if it exists.""" - prefix = _get_cache_prefix(dataset) - optimized = DOWNLOAD_DIR / f"{prefix}.zarr" - if optimized.exists(): - return optimized - return None +def get_icechunk_path(dataset: dict[str, Any]) -> Path: + """Return the Icechunk store path for a dataset (may not exist yet).""" + return DOWNLOAD_DIR / f"{_get_cache_prefix(dataset)}.icechunk" def _get_dynamic_function(full_path: str) -> Callable[..., Any]: diff --git a/climate_api/data_registry/services/datasets.py b/climate_api/data_registry/services/datasets.py index b9e257ee..1a5aea61 100644 --- a/climate_api/data_registry/services/datasets.py +++ b/climate_api/data_registry/services/datasets.py @@ -155,21 +155,3 @@ def _validate_dataset_template(dataset: object, *, source: str) -> None: plugin = ingestion.get("plugin") if not (isinstance(plugin, str) and plugin): raise ValueError(f"Dataset template '{dataset_id}' in {source} must define ingestion.plugin") - - sync_availability = sync_block.get("availability") if isinstance(sync_block, dict) else None - if sync_availability is not None: - _validate_sync_availability(sync_availability, dataset_id=dataset_id, source=source) - - -def _validate_sync_availability(sync_availability: object, *, dataset_id: str, source: str) -> None: - """Validate optional source availability policy metadata.""" - if not isinstance(sync_availability, dict): - raise ValueError(f"Dataset template '{dataset_id}' in {source} has invalid sync.availability") - - latest_available_function = sync_availability.get("latest_available_function") - if latest_available_function is not None and ( - not isinstance(latest_available_function, str) or not latest_available_function - ): - raise ValueError( - f"Dataset template '{dataset_id}' in {source} has 
invalid sync.availability.latest_available_function" - ) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 73f4f4a3..a7e07270 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -286,7 +286,7 @@ def _create_icechunk_artifact( list(bbox) if bbox is not None else (list(extent["bbox"]) if extent else [-180, -90, 180, 90]) ) _check_bbox_overlap(dataset, resolved_bbox) - store_path = downloader.DOWNLOAD_DIR / f"{dataset_id}.icechunk" + store_path = downloader.get_icechunk_path(dataset) if overwrite and store_path.exists(): import shutil diff --git a/climate_api/transforms/reproject.py b/climate_api/transforms/reproject.py index d3842e9b..775f1a17 100644 --- a/climate_api/transforms/reproject.py +++ b/climate_api/transforms/reproject.py @@ -24,7 +24,7 @@ def reproject_to_instance_crs( ``params`` dict if your source uses a different input CRS. The dataset must already have ``x`` and ``y`` as its spatial dimension names, - which is guaranteed by ``build_dataset_zarr`` before transforms are run. + which is guaranteed by the ingest orchestrator before transforms are run. 
""" target_crs = api_config.get_crs() if target_crs == source_crs: diff --git a/tests/test_dataset_registry.py b/tests/test_dataset_registry.py index 58ffabd9..60730caf 100644 --- a/tests/test_dataset_registry.py +++ b/tests/test_dataset_registry.py @@ -138,52 +138,3 @@ def test_dataset_registry_accepts_supported_sync_execution( assert datasets.list_datasets()[0]["sync"]["execution"] == "append" -def test_dataset_registry_rejects_invalid_sync_availability_function( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - registry_file = tmp_path / "invalid_sync_availability.yaml" - registry_file.write_text( - """ -- id: invalid_sync_availability - name: Invalid sync availability - variable: value - period_type: daily - sync: - kind: temporal - availability: - latest_available_function: 42 - ingestion: - plugin: some.ingest.Plugin -""", - encoding="utf-8", - ) - monkeypatch.setattr(datasets, "CONFIGS_DIR", tmp_path) - - with pytest.raises(ValueError, match="invalid sync.availability.latest_available_function"): - datasets.list_datasets() - - -def test_dataset_registry_accepts_sync_availability_function( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - registry_file = tmp_path / "valid_sync_availability.yaml" - registry_file.write_text( - """ -- id: valid_sync_availability - name: Valid sync availability - variable: value - period_type: daily - sync: - kind: temporal - availability: - latest_available_function: some.module.latest_available - ingestion: - plugin: some.ingest.Plugin -""", - encoding="utf-8", - ) - monkeypatch.setattr(datasets, "CONFIGS_DIR", tmp_path) - - assert datasets.list_datasets()[0]["sync"]["availability"]["latest_available_function"].endswith("latest_available") From c3ce2ef538faa32469b97c7dba3bc0f0f44bfd1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 02:01:51 +0200 Subject: [PATCH 37/80] =?UTF-8?q?docs:=20remove=20automatic=20reprojection?= 
=?UTF-8?q?=20references=20=E2=80=94=20plugin=20CRS=20is=20the=20stored=20?= =?UTF-8?q?CRS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The legacy build_dataset_zarr pipeline applied reproject_to_instance_crs as a fixed step. The Icechunk plugin path does not: data is stored in whatever CRS the plugin declares via GridSpec.crs. reproject_to_instance_crs is still available as an explicit declared transform when needed. Co-Authored-By: Claude Sonnet 4.6 --- docs/architecture.md | 47 ++++++++++++++++++++++------------------ docs/transforms.md | 9 +++++--- docs/zarr_and_geozarr.md | 4 ++-- 3 files changed, 34 insertions(+), 26 deletions(-) diff --git a/docs/architecture.md b/docs/architecture.md index 25dbf5ec..1c635fea 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -172,15 +172,14 @@ Processes are named operations triggered via `POST /processes/{id}/execution`. T Transforms are applied at a consistent point in the ingestion lifecycle: -1. ingestion function writes raw NetCDF files to disk -2. framework reads and normalises the data into an xarray Dataset -3. `_run_transforms(ds, dataset)` applies each declared transform in order -4. result is reprojected to instance CRS -5. zarr store is written with auto-computed chunking -6. framework writes GeoZarr root attributes -7. framework computes coverage from the zarr +1. plugin fetches one period of raw data as an xarray Dataset in the source CRS +2. `_run_transforms(ds, dataset)` applies each declared transform in order +3. orchestrator writes the period to the Icechunk store +4. framework writes GeoZarr root attributes after the first period is committed -Transforms see post-download, pre-reproject data. They should only modify data values and variable-level attributes. The framework writes dataset-level attributes (GeoZarr) after the transform pipeline completes. +Transforms see raw fetched values in the source CRS and source units. 
They should only modify data values and variable-level attributes. The framework writes dataset-level attributes (GeoZarr) after the first write completes. + +No automatic reprojection occurs. Data is stored in whatever CRS the plugin returns (declared via `GridSpec.crs` in `probe()`). If CRS conversion is needed, declare `reproject_to_instance_crs` as an explicit transform in the `transforms` list. --- @@ -194,13 +193,24 @@ Every zarr artifact must have GeoZarr root attributes for map rendering to work The map viewer reads `spatial:bbox` and `proj:code` to determine where to position tiles on the map. -**The framework writes these attributes — plugins do not.** They are written in `build_dataset_zarr` after transforms and reprojection, using the actual coordinate bounds of the final written data and the instance CRS. +**The framework writes these attributes — plugins do not.** They are written by the orchestrator after the first period is committed, using the actual coordinate bounds of the written data and the CRS declared in `GridSpec.crs`. --- ## CRS handling -The instance CRS is configured in `climate-api.yaml`: +Datasets are stored in whatever CRS the plugin returns. The plugin declares this via `GridSpec.crs` in its `probe()` response, and the framework writes `proj:code` accordingly. No automatic reprojection occurs. + +If you need to reproject data to a specific CRS, declare `reproject_to_instance_crs` as an explicit transform in the dataset template: + +```yaml +transforms: + - function: climate_api.transforms.reproject_to_instance_crs + params: + source_crs: EPSG:32633 +``` + +The instance CRS (used as the reprojection target when `reproject_to_instance_crs` is declared) is configured in `climate-api.yaml`: ```yaml extent: @@ -209,10 +219,6 @@ extent: crs: EPSG:32633 # optional; defaults to EPSG:4326 ``` -Downloaded data is reprojected from the source CRS (`source_crs` in the template, default `EPSG:4326`) to the instance CRS during ingestion. 
The stored zarr is always in the instance CRS. - -If no `crs` is set in the config, data is stored in `EPSG:4326` (WGS84). This is the correct default for instances that do not need a metric CRS. - --- ## Artifact deduplication and version history @@ -235,11 +241,10 @@ Plugin code (ingestion functions, transforms, processes) can rely on the followi | Concern | Where handled | | ----------------------------------------------------- | ------------------------------------------- | -| Coordinate name normalisation (`lat` → `y`, etc.) | `build_dataset_zarr` | -| Reprojection to instance CRS | `reproject_to_instance_crs` | -| Zarr chunking (auto-sized from `extents.temporal.resolution`) | `_compute_time_space_chunks` | -| Multiscale pyramid generation (when dims > 2048×2048) | `build_dataset_zarr` | -| GeoZarr root attributes (`spatial:bbox`, `proj:code`) | `build_dataset_zarr` | +| Coordinate name normalisation (`lat` → `y`, etc.) | Plugin (returns canonical `x`/`y`/`time`) | +| Zarr chunk sizing (time: 1 per period → rechunk pass) | `rechunk_store` (if `rechunk_time` set) | +| Multiscale pyramid generation (when dims > 2048×2048) | `build_pyramid_store` (if `pyramid=True`) | +| GeoZarr root attributes (`spatial:bbox`, `proj:code`) | Orchestrator after first period commit | | Artifact coverage computation | `_coverage_from_dataset` | | Artifact record persistence | `_store_artifact` | | pygeoapi publication | `publish_artifact_record` if `publish=true` | @@ -265,6 +270,6 @@ For **legacy ZARR datasets** (downloader-based, no `ingestion.plugin`), `append` For **plugin-path** datasets, `append` compares the pending period list against the already-committed time coordinates in the Icechunk store and fetches only the missing periods. The Icechunk store itself is the source of truth — no separate download cache. A crash leaves the store at the last committed period; restart resumes from there without any additional recovery logic. 
-### Transforms run after download, before reproject +### Transforms run per period, before the zarr write -Transforms see raw downloaded values in the source CRS and source units. The order is: download → transform → reproject → write zarr. +Transforms see raw fetched values in the plugin's source CRS and units. The order per period is: fetch → transform → write zarr. diff --git a/docs/transforms.md b/docs/transforms.md index 83f3e47f..e7fd43f9 100644 --- a/docs/transforms.md +++ b/docs/transforms.md @@ -50,12 +50,15 @@ Used by: ERA5-Land total precipitation (`era5land_precipitation_hourly`). ## Reprojection -Reprojection to the instance CRS is handled automatically by the ingestion pipeline as a separate step after all user-declared transforms have run. It is not a transform and should not be declared in the `transforms` list. +No automatic reprojection occurs. Data is stored in whatever CRS the plugin returns (declared via `GridSpec.crs` in `probe()`). -If your source data is not in `EPSG:4326`, declare `source_crs` in the dataset template so the pipeline knows what to reproject from: +If you need to reproject to a different CRS, declare `reproject_to_instance_crs` as an explicit transform. It reprojects from `source_crs` (default `EPSG:4326`) to the instance CRS set in `climate-api.yaml`: ```yaml -source_crs: EPSG:32633 +transforms: + - function: climate_api.transforms.reproject_to_instance_crs + params: + source_crs: EPSG:32633 ``` --- diff --git a/docs/zarr_and_geozarr.md b/docs/zarr_and_geozarr.md index 48ae3f52..55ed6643 100644 --- a/docs/zarr_and_geozarr.md +++ b/docs/zarr_and_geozarr.md @@ -104,7 +104,7 @@ A plain Zarr store has no concept of spatial coordinates. A map viewer opening i | `proj:code` | `EPSG:4326` | CRS of the stored coordinates | | `zarr_conventions` | `[{...}]` | Convention declarations | -These attributes are computed from the actual coordinate bounds of the written data and the instance CRS. 
They are always written by the framework after transforms and reprojection have run. This guarantees they always reflect the final stored data. +These attributes are computed from the actual coordinate bounds of the written data and the CRS declared by the plugin in `GridSpec.crs`. They are written by the framework after the first period is committed. Later periods are appended along `time` only, so the bounds and CRS recorded at that point continue to describe the final stored data. `zarr_conventions` for a flat store contains the base GeoZarr convention declaration. For pyramid stores it also includes a multiscales entry that declares the level structure. @@ -120,7 +120,7 @@ extent: crs: EPSG:32633 # optional; defaults to EPSG:4326 ``` -Datasets are always stored in the instance CRS. During ingestion, data is reprojected from its source CRS (declared as `source_crs` in the template, default `EPSG:4326`) to the instance CRS. The stored `spatial:bbox` is therefore in the instance CRS — UTM eastings and northings for a UTM instance, degrees for a WGS84 instance. +Datasets are stored in whatever CRS the plugin returns. The plugin declares this via `GridSpec.crs` in its `probe()` response, and the framework writes `proj:code` from that value. No automatic reprojection occurs — if CRS conversion is needed, declare `reproject_to_instance_crs` as an explicit transform in the dataset template. The stored `spatial:bbox` is in the plugin's native CRS. STAC metadata also stores the WGS84 bounding box alongside the native bbox, so catalogue clients that expect geographic coordinates always get one regardless of the instance CRS. 
From cd46ebdf09bf57e8dfef9ed6c143c3bd19ba4b09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 10:07:48 +0200 Subject: [PATCH 38/80] =?UTF-8?q?fix:=20address=20Copilot=20review=20?= =?UTF-8?q?=E2=80=94=20pyramid=20append=20guard,=20WorldPop=20validation,?= =?UTF-8?q?=20commit=5Fbatch=5Fsize=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - _supports_append: Icechunk pyramid stores (multiscales in root attrs) now fall back to rematerialize; appending to a pyramidized store would write flat data to root alongside the pyramid levels - WorldPopPlugin.__init__: raise ValueError for missing country_code so the caller gets HTTP 400 instead of a silent invalid-URL ingest failure - services.py: also catch ValueError from load_plugin (maps to HTTP 400) - test_ingest_plugins.py: add 'import rioxarray' to register .rio accessor before test helpers call da.rio.set_spatial_dims / da.rio.write_crs - commit_batch_size: corrected in protocol.py docstring, architecture.md (×3), and extensibility.md (×2) — it controls cursor checkpoint frequency, not Icechunk commit frequency (every period is always committed individually) - built_in_datasets.md: ERA5-Land ingest description updated (hourly periods, not monthly-then-aggregated); WorldPop country_code source clarified Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/plugins/worldpop.py | 8 +++++++- climate_api/ingest/protocol.py | 11 ++++++----- climate_api/ingestions/services.py | 2 +- climate_api/ingestions/sync_engine.py | 23 +++++++++++++++++++++++ docs/architecture.md | 6 +++--- docs/built_in_datasets.md | 6 +++--- docs/extensibility.md | 4 ++-- tests/test_ingest_plugins.py | 1 + 8 files changed, 46 insertions(+), 15 deletions(-) diff --git a/climate_api/ingest/plugins/worldpop.py b/climate_api/ingest/plugins/worldpop.py index 8a28b1e1..69691dc9 100644 --- a/climate_api/ingest/plugins/worldpop.py +++ 
b/climate_api/ingest/plugins/worldpop.py @@ -49,7 +49,13 @@ class WorldPopPlugin: pyramid: bool = True def __init__(self, country_code: str = "", version: str = "global2") -> None: - self.country_code = country_code.upper() + cc = country_code.strip().upper() + if not cc: + raise ValueError( + "WorldPopPlugin requires a 3-letter ISO country code (e.g. 'NOR'). " + "Set extent.country_code or ingestion.params.country_code." + ) + self.country_code = cc self.version = version # ------------------------------------------------------------------ diff --git a/climate_api/ingest/protocol.py b/climate_api/ingest/protocol.py index 15bdeba1..95cf269f 100644 --- a/climate_api/ingest/protocol.py +++ b/climate_api/ingest/protocol.py @@ -49,11 +49,12 @@ class IngestionPlugin(Protocol): Keep at 1 for sources with large per-period files or rate-limited APIs. Raise for sources where individual periods are small (< 50 MB). - commit_batch_size: how often the job cursor checkpoint is saved. - Every period is always committed individually to Icechunk; this - controls how frequently the orchestrator persists the cursor so that a - restart resumes from the last checkpoint rather than re-scanning the - store. Use 1 for monthly sources, ~30 for daily, ~720 for hourly. + commit_batch_size: cursor checkpoint interval. + Every period is always committed individually to Icechunk. This + attribute controls how frequently the orchestrator persists the job + cursor so that a restart resumes from the last checkpoint rather than + re-scanning the whole store. Use 1 for monthly sources, ~30 for daily, + ~720 for hourly. rechunk_time (optional class attribute): target time chunk size for the post-ingest rechunk. 
When set, the orchestrator rewrites the store after diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index a7e07270..2190b056 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -300,7 +300,7 @@ def _create_icechunk_artifact( extra_params["country_code"] = extent_country_code try: plugin = load_plugin(plugin_path, params, extra_params=extra_params or None) - except TypeError as exc: + except (TypeError, ValueError) as exc: raise HTTPException(status_code=400, detail=f"Plugin configuration error: {exc}") from exc effective_start = ingest_start if ingest_start is not None else start diff --git a/climate_api/ingestions/sync_engine.py b/climate_api/ingestions/sync_engine.py index 4236df3e..765ab5fc 100644 --- a/climate_api/ingestions/sync_engine.py +++ b/climate_api/ingestions/sync_engine.py @@ -418,6 +418,29 @@ def _supports_append(source_dataset: dict[str, Any], latest_artifact: ArtifactRe from climate_api.ingestions.schemas import ArtifactFormat if latest_artifact.format == ArtifactFormat.ICECHUNK: + # Pyramid Icechunk stores have data under group "0"; appending to root + # would create a second flat dataset instead of extending the pyramid. + # Fall back to rematerialize so the full pyramid is rebuilt. 
+ artifact_path = latest_artifact.path + if artifact_path: + from pathlib import Path + + from climate_api.ingest.store import open_or_create_repo + + try: + import zarr + + repo = open_or_create_repo(Path(artifact_path)) + session = repo.readonly_session("main") + root = zarr.open_group(session.store, mode="r") + if "multiscales" in root.attrs: + logger.warning( + "Sync append is not supported for pyramid Icechunk dataset '%s'; falling back to rematerialize", + source_dataset.get("id", ""), + ) + return False + except Exception: + pass # store missing or unreadable — let the ingest path handle it return True if source_dataset.get("sync", {}).get("execution") != SyncAction.APPEND.value: diff --git a/docs/architecture.md b/docs/architecture.md index 1c635fea..92f9012b 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -67,7 +67,7 @@ Orchestrator │ for each pending period: │ fetch_period() → xr.Dataset (in source CRS) │ to_zarr(icechunk_store, append_dim="time") - │ commit every commit_batch_size periods + │ commit every period; checkpoint cursor every commit_batch_size periods │ rechunk in-place (if rechunk_time is set) │ expire intermediate snapshots │ register ArtifactFormat.ICECHUNK artifact record @@ -126,7 +126,7 @@ The platform has four extension points. Each one has a narrow contract — the f ```python class MyPlugin: max_concurrency: int = 1 # parallel fetch limit - commit_batch_size: int = 1 # periods per Icechunk commit + commit_batch_size: int = 1 # cursor checkpoint interval (every period is committed) async def probe(self, bbox: list[float], **params) -> GridSpec: """Metadata-only probe — no data transfer.""" @@ -141,7 +141,7 @@ class MyPlugin: ... ``` -The orchestrator calls `probe()` once, `periods()` once, then drives a bounded-concurrency fetch loop — writing each period directly to an Icechunk store and committing every `commit_batch_size` periods. Plugins never touch zarr or Icechunk directly. 
+The orchestrator calls `probe()` once, `periods()` once, then drives a bounded-concurrency fetch loop — writing each period directly to an Icechunk store (one Icechunk commit per period) and checkpointing the job cursor every `commit_batch_size` periods. Plugins never touch zarr or Icechunk directly. See [Extensibility — Ingestion plugins](extensibility.md#ingestion-plugins) for the full protocol and `GridSpec` reference. diff --git a/docs/built_in_datasets.md b/docs/built_in_datasets.md index fd5f488f..4f68bc5c 100644 --- a/docs/built_in_datasets.md +++ b/docs/built_in_datasets.md @@ -44,7 +44,7 @@ CHIRPS (Climate Hazards Group InfraRed Precipitation with Station data) v3 is a ERA5-Land is a global atmospheric reanalysis produced by ECMWF. The 2 m temperature variable (`t2m`) represents the air temperature 2 metres above the land surface, including corrections for topography relative to the ERA5 pressure levels. -**Ingest method** — the DestinE zarr store is opened lazily over HTTPS. Each monthly period is fetched as an hourly slice, aggregated to daily values (mean), and written directly to the Icechunk store — no intermediate files on disk. The source's 0–360° longitude range is converted to −180–180° before writing. +**Ingest method** — the DestinE zarr store is opened lazily over HTTPS. Individual hourly periods are fetched and written directly to the Icechunk store — no intermediate files on disk. The source's 0–360° longitude range is converted to −180–180° before writing. `commit_batch_size = 720` checkpoints the cursor once per month of hourly data. **Sync behaviour** — new months are appended incrementally. ERA5-Land is published with a nominal 5-day lag; months closer than 120 hours to today are not requested. @@ -67,7 +67,7 @@ ERA5-Land is a global atmospheric reanalysis produced by ECMWF. 
The 2 m temperat Total precipitation (`tp`) from ERA5-Land is an accumulated hourly value representing the sum of large-scale and convective precipitation falling onto the land surface. It is useful as a high-resolution complement to CHIRPS for countries outside CHIRPS's 50°N–50°S band, or for sub-daily analysis. -**Ingest method** — same as ERA5-Land temperature: monthly periods fetched and aggregated, written directly to Icechunk. +**Ingest method** — same as ERA5-Land temperature: individual hourly periods fetched from DestinE and written directly to Icechunk. **Sync behaviour** — same 5-day lag as ERA5-Land temperature; months are appended incrementally. @@ -91,7 +91,7 @@ Total precipitation (`tp`) from ERA5-Land is an accumulated hourly value represe WorldPop Global2 provides gridded population estimates and projections at 100 m resolution. Each raster year represents estimated residential population counts. Years up to and including the present are backward-modelled estimates; years beyond the present are forward projections. -**Ingest method** — each year is downloaded as a per-country GeoTIFF from WorldPop's HTTP server (typically 50–200 MB per file), clipped to the configured bbox, and written directly to the Icechunk store. The country code must be set in `ingestion.params.country_code` in the dataset template (e.g. `NOR`, `GHA`). +**Ingest method** — each year is downloaded as a per-country GeoTIFF from WorldPop's HTTP server (typically 50–200 MB per file), clipped to the configured bbox, and written directly to the Icechunk store. A multiscale pyramid is built after the initial ingest. The country code is taken from `extent.country_code` in `climate-api.yaml` (preferred) or from `ingestion.params.country_code` in the dataset template. **Sync behaviour** — population data is released year by year. The API uses a `release`-kind sync that checks each calendar year separately. Future years (projections through 2030) are also requestable. 
diff --git a/docs/extensibility.md b/docs/extensibility.md index f6d6a621..6e15cba1 100644 --- a/docs/extensibility.md +++ b/docs/extensibility.md @@ -56,7 +56,7 @@ import xarray as xr class MyPlugin: max_concurrency: int = 1 # parallel fetch limit - commit_batch_size: int = 1 # periods per Icechunk commit + commit_batch_size: int = 1 # cursor checkpoint interval (every period is committed) async def probe(self, bbox: list[float], **params) -> GridSpec: """Metadata-only source probe. Returns grid shape, CRS, dtype. No data transfer.""" @@ -92,7 +92,7 @@ Set `time_dim=False` for static (time-invariant) datasets — the orchestrator i 2. Calls `periods()` once to get the full period list; filters against already-committed time coordinates. 3. Creates all fetch tasks upfront so up to `max_concurrency` fetches are in flight simultaneously. 4. Awaits tasks in chronological order so writes are always sequential. -5. Commits to the Icechunk store after every `commit_batch_size` periods. +5. Commits every period to the Icechunk store; checkpoints the job cursor every `commit_batch_size` periods so a restart resumes from the last checkpoint rather than re-scanning the whole store. 6. On restart, resumes from the last committed period — a crash loses at most one uncommitted batch. 7. After all periods are written, runs a rechunk pass if the plugin declares `rechunk_time`, then expires intermediate Icechunk snapshots to prune history. 
diff --git a/tests/test_ingest_plugins.py b/tests/test_ingest_plugins.py index fc9d435d..0d4c6172 100644 --- a/tests/test_ingest_plugins.py +++ b/tests/test_ingest_plugins.py @@ -16,6 +16,7 @@ import numpy as np import pandas as pd import pytest +import rioxarray # noqa: F401 — registers the .rio accessor used in test helpers import xarray as xr from climate_api.ingest.protocol import GridSpec, IngestionPlugin From a9e842ff369c7e5df327d6213c95027ef8563fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 12:29:25 +0200 Subject: [PATCH 39/80] feat: POST /ingestions Prefer: respond-async + progress callbacks Wires POST /ingestions into the async jobs framework via the Prefer: respond-async header. Returns HTTP 202 + Location: /jobs/{id} for background execution; progress is trackable via GET /jobs/{id}. - New execution.py: execute_ingest() function for the jobs framework - routes.py: Prefer: respond-async dispatch to submit_process_job() - services.py: on_progress / is_cancel_requested / save_cursor / load_cursor callbacks threaded through create_artifact() and _create_icechunk_artifact() to run_ingest_sync() Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingestions/execution.py | 52 +++++++++++++++++++++++++++++ climate_api/ingestions/routes.py | 43 +++++++++++++++++++++--- climate_api/ingestions/services.py | 22 ++++++++++++ 3 files changed, 113 insertions(+), 4 deletions(-) create mode 100644 climate_api/ingestions/execution.py diff --git a/climate_api/ingestions/execution.py b/climate_api/ingestions/execution.py new file mode 100644 index 00000000..b45011cb --- /dev/null +++ b/climate_api/ingestions/execution.py @@ -0,0 +1,52 @@ +"""Execution function for the ingest process (used by the async jobs framework).""" + +from __future__ import annotations + +from typing import Any + +from fastapi import HTTPException + +from climate_api.data_registry.services import datasets as registry_datasets +from climate_api.extents.services 
import get_extent_or_404 +from climate_api.ingestions import services + + +def execute_ingest( + *, + dataset_id: str, + start: str, + end: str | None = None, + overwrite: bool = False, + publish: bool = True, + on_progress: Any | None = None, + is_cancel_requested: Any | None = None, + save_cursor: Any | None = None, + load_cursor: Any | None = None, +) -> dict[str, Any]: + """Ingest one dataset for the configured extent and return a result summary. + + Accepts optional job-framework callbacks (on_progress, is_cancel_requested, + save_cursor, load_cursor) so that progress is visible when run as an async job. + """ + dataset = registry_datasets.get_dataset(dataset_id) + if dataset is None: + raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found") + extent = get_extent_or_404() + resolved_bbox = list(extent["bbox"]) + artifact = services.create_artifact( + dataset=dataset, + start=start, + end=end, + bbox=resolved_bbox, + overwrite=overwrite, + prefer_zarr=False, + publish=publish, + on_progress=on_progress, + is_cancel_requested=is_cancel_requested, + save_cursor=save_cursor, + load_cursor=load_cursor, + ) + return { + "status": "completed", + "ingestion_id": artifact.artifact_id, + } diff --git a/climate_api/ingestions/routes.py b/climate_api/ingestions/routes.py index 3b33ef95..5567b3bb 100644 --- a/climate_api/ingestions/routes.py +++ b/climate_api/ingestions/routes.py @@ -1,6 +1,8 @@ """Routes for EO ingestion, datasets, and sync operations.""" -from fastapi import APIRouter, HTTPException +from typing import Any + +from fastapi import APIRouter, Header, HTTPException from fastapi.responses import FileResponse from starlette.responses import Response @@ -24,9 +26,42 @@ sync_router = APIRouter() -@ingestions_router.post("", response_model=IngestionResponse) -def create_ingestion(request: CreateIngestionRequest) -> IngestionResponse: - """Create or update a managed dataset from a dataset template and configured extent.""" +def 
_prefer_respond_async(prefer: str | None) -> bool: + if prefer is None: + return False + directives = [item.strip().split(";", 1)[0].strip().lower() for item in prefer.split(",")] + return "respond-async" in directives + + +@ingestions_router.post("", response_model=None) +def create_ingestion( + request: CreateIngestionRequest, + response: Response, + prefer: str | None = Header(default=None), +) -> Any: + """Create or update a managed dataset from a dataset template and configured extent. + + Send ``Prefer: respond-async`` to run the ingest as a background job and + receive HTTP 202 with a ``Location: /jobs/{job_id}`` header immediately. + Poll ``GET /jobs/{job_id}`` for progress and completion status. + """ + if _prefer_respond_async(prefer): + from climate_api.jobs.service import get_job_service + + job = get_job_service().submit_process_job( + process_id="ingest", + request={ + "dataset_id": request.dataset_id, + "start": request.start, + "end": request.end, + "overwrite": request.overwrite, + "publish": request.publish, + }, + ) + response.status_code = 202 + response.headers["Location"] = f"/jobs/{job.job_id}" + return job + dataset = _get_dataset_or_404(request.dataset_id) extent = get_extent_or_404() resolved_bbox = list(extent["bbox"]) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 2190b056..8e055df6 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -7,6 +7,7 @@ import mimetypes import os from collections.abc import Callable +from typing import Any from datetime import UTC, datetime from pathlib import Path from uuid import uuid4 @@ -200,6 +201,10 @@ def create_artifact( publish: bool, download_start: str | None = None, download_end: str | None = None, + on_progress: Any | None = None, + is_cancel_requested: Any | None = None, + save_cursor: Any | None = None, + load_cursor: Any | None = None, ) -> ArtifactRecord: """Ingest a dataset via its plugin, persist it locally, 
and store artifact metadata.""" period_type = str(dataset["period_type"]) @@ -249,6 +254,10 @@ def create_artifact( overwrite=overwrite, publish=publish, ingest_start=download_start, + on_progress=on_progress, + is_cancel_requested=is_cancel_requested, + save_cursor=save_cursor, + load_cursor=load_cursor, ) @@ -262,6 +271,10 @@ def _create_icechunk_artifact( overwrite: bool = False, publish: bool, ingest_start: str | None = None, + on_progress: Any | None = None, + is_cancel_requested: Any | None = None, + save_cursor: Any | None = None, + load_cursor: Any | None = None, ) -> ArtifactRecord: """Run per-period Icechunk ingest and register the resulting store as an artifact. @@ -332,6 +345,10 @@ def _create_icechunk_artifact( rechunk_time=rechunk_time, apply_transforms=apply_transforms, pyramid=pyramid, + on_progress=on_progress, + is_cancel_requested=is_cancel_requested, + save_cursor=save_cursor, + load_cursor=load_cursor, ) if not store_path.exists(): @@ -343,6 +360,11 @@ def _create_icechunk_artifact( try: ds = xr.open_zarr(session.store) + # For pyramid stores the data and time live under group "0"; root has + # only multiscales metadata with empty coordinates. + if "time" not in ds.coords and "multiscales" in ds.attrs: + ds.close() + ds = xr.open_zarr(session.store, group="0") except Exception as exc: raise HTTPException(status_code=409, detail="Ingest produced no readable data for the requested range") from exc from climate_api import config as api_config From 85e6b1941cb6efabfdb40908a2a42870ef0f36de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 12:29:34 +0200 Subject: [PATCH 40/80] =?UTF-8?q?fix:=20pyramid=20map=20rendering=20?= =?UTF-8?q?=E2=80=94=20strip=20sharding,=20serve=20root=20for=20zoom=20sel?= =?UTF-8?q?ection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes that restore zoom-level-aware map rendering for Icechunk pyramid stores: 1. 
build_pyramid_store (store.py): topozarr writes sharding_indexed by default, which @carbonplan/zarr-layer 0.5.0 cannot decode, causing a browser render loop. Stripping the "shards" key from the topozarr encoding writes plain bytes+zstd chunks instead. 2. _public_zarr_asset_href (stac/services.py): the STAC zarr asset now points to the pyramid root URL for Icechunk stores. zarr-layer 0.5.0 reads the multiscales metadata from the root and selects the appropriate level based on the current map zoom — the same behaviour as regular zarr pyramids. Also adds xproj CRS assignment in build_pyramid_store, required by topozarr.create_pyramid() to read spatial extent metadata. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/store.py | 22 +++++++++++++++++++++- climate_api/stac/services.py | 19 +------------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/climate_api/ingest/store.py b/climate_api/ingest/store.py index efbe4a8b..afa9ab7e 100644 --- a/climate_api/ingest/store.py +++ b/climate_api/ingest/store.py @@ -95,6 +95,8 @@ def build_pyramid_store(store_path: Path, *, x_dim: str = "x", y_dim: str = "y") if not store_path.exists(): return + import zarr + repo = open_or_create_repo(store_path) read_session = repo.readonly_session("main") ds = xr.open_zarr(read_session.store) @@ -109,9 +111,27 @@ def build_pyramid_store(store_path: Path, *, x_dim: str = "x", y_dim: str = "y") finally: ds.close() + # topozarr requires xproj CRS on the dataset. Read it from the GeoZarr + # root attribute written by the orchestrator (proj:code = "EPSG:"). 
+ try: + import xproj # noqa: F401 — registers .proj accessor + + root = zarr.open_group(read_session.store, mode="r") + proj_code = root.attrs.get("proj:code", "EPSG:4326") + epsg = int(proj_code.split(":")[1]) if ":" in proj_code else 4326 + ds_loaded = ds_loaded.proj.assign_crs({"EPSG": epsg}) + except Exception: + logger.warning("Could not assign CRS for pyramid build on %s; proceeding without xproj", store_path) + pyramid = create_pyramid(ds_loaded, levels=levels, x_dim=x_dim, y_dim=y_dim) + # Strip "shards" from topozarr encoding: sharding_indexed codec isn't supported + # by zarr-layer (JS) client, causing a render loop in the map viewer. + no_shard_encoding = { + level: {var: {k: v for k, v in enc.items() if k != "shards"} for var, enc in vars_.items()} + for level, vars_ in pyramid.encoding.items() + } write_session = repo.writable_session("main") - pyramid.dt.to_zarr(write_session.store, mode="w", encoding=pyramid.encoding, zarr_format=3) + pyramid.dt.to_zarr(write_session.store, mode="w", encoding=no_shard_encoding, zarr_format=3) write_session.commit(f"pyramid: {levels} levels") logger.info("Built %d-level pyramid for %s (%dx%d)", levels, store_path, nx, ny) diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index ed6cf329..d0ced39e 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -313,10 +313,7 @@ def _public_zarr_asset_href( source_dataset: dict[str, Any], ) -> str: artifact_path = _artifact_store_path(artifact) - if artifact.format == ArtifactFormat.ICECHUNK: - if _is_icechunk_pyramid(artifact_path): - return _abs_url(request, f"/zarr/{dataset_id}/0") - elif _is_pyramid_zarr(artifact_path): + if _is_pyramid_zarr(artifact_path): return _abs_url(request, f"/zarr/{dataset_id}/0") return _abs_url(request, f"/zarr/{dataset_id}") @@ -328,20 +325,6 @@ def _is_pyramid_zarr(artifact_path: str) -> bool: return (Path(artifact_path) / "0").is_dir() -def _is_icechunk_pyramid(store_path: str) -> bool: - 
"""Return True if the Icechunk store contains a multiscale pyramid.""" - try: - import zarr - - from climate_api.ingest.store import open_or_create_repo - - repo = open_or_create_repo(Path(store_path)) - session = repo.readonly_session("main") - root = zarr.open_group(session.store, mode="r") - return "multiscales" in root.attrs - except Exception: - return False - def _abs_url(request: Request, path: str) -> str: base_url = os.getenv("CLIMATE_API_BASE_URL") From 9d81dcac31bd9fa5bd88fa381e58eebb9619d38d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 12:30:35 +0200 Subject: [PATCH 41/80] docs: update STAC pyramid asset href behaviour for Icechunk stores Icechunk pyramid stores now expose the root URL so zarr-layer can read multiscales and select the zoom-appropriate level automatically. Co-Authored-By: Claude Sonnet 4.6 --- docs/implementation-status.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/implementation-status.md b/docs/implementation-status.md index cec1425a..dd061ca8 100644 --- a/docs/implementation-status.md +++ b/docs/implementation-status.md @@ -153,7 +153,7 @@ Published Zarr-backed managed datasets appear there as one STAC Collection per d Current STAC details: -- pyramid Zarr stores (detected by the presence of a `0/` level on disk) expose `/zarr/{dataset_id}/0` as the canonical asset href +- Icechunk pyramid stores (detected by `multiscales` in root attrs) expose the root `/zarr/{dataset_id}` as the asset href so `@carbonplan/zarr-layer` can read the `multiscales` metadata and select the appropriate zoom level automatically; regular flat-Zarr pyramid stores (legacy) expose `/zarr/{dataset_id}/0` - temporal extents are normalized to RFC 3339 in both STAC and Datacube temporal extent fields - STAC collection `license` currently defaults to `various` - spatial `step` values are rounded for readability while preserving axis direction From 731cd3a1b36c8516d37fbde7a90bc94209c9c3e4 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 12:33:31 +0200 Subject: [PATCH 42/80] docs: add Icechunk section to zarr_and_geozarr.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Covers why Icechunk is used (MVCC, safe resume, snapshot isolation, prunable history), the snapshot lifecycle during a typical ingest, and how serving changed from FileResponse to IcechunkStore key resolution. Also updates the stale store-path reference (.zarr → .icechunk) and the serving section to reflect the new backend. Co-Authored-By: Claude Sonnet 4.6 --- docs/zarr_and_geozarr.md | 49 ++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/docs/zarr_and_geozarr.md b/docs/zarr_and_geozarr.md index 55ed6643..df78bad3 100644 --- a/docs/zarr_and_geozarr.md +++ b/docs/zarr_and_geozarr.md @@ -49,12 +49,47 @@ The Climate API targets the same access pattern at country scale for arbitrary s ## Store layout on disk -Each managed dataset has exactly one Zarr store on disk, stored under `{data_dir}/downloads/{dataset_id}.zarr`. The store is either: +Each managed dataset has exactly one Icechunk repository on disk, stored under `{data_dir}/downloads/{dataset_id}.icechunk`. The zarr content inside the repository is either: -- **Flat** — a single-resolution Zarr store with dimensions `(time, x, y)` -- **Pyramid** — a multi-resolution Zarr store with levels `0/`, `1/`, `2/`, … where `0/` is the full resolution +- **Flat** — a single-resolution store with dimensions `(time, x, y)` +- **Pyramid** — a multi-resolution store with levels `0/`, `1/`, `2/`, … where `0/` is the full resolution -The flat vs. pyramid decision is made at build time based on spatial size (see [Multiscale pyramids](#multiscale-pyramids) below). +The flat vs. pyramid decision is made at ingest time based on spatial size (see [Multiscale pyramids](#multiscale-pyramids) below). 
+ +--- + +## Icechunk — versioned Zarr storage + +[Icechunk](https://icechunk.io) is a transactional storage layer that sits between the application and the underlying Zarr v3 data. It exposes a standard Zarr store interface to writers and readers, but adds **MVCC (multi-version concurrency control)**: every write is committed as an immutable snapshot, and readers always see a consistent view of the data regardless of concurrent writes. + +### Why Icechunk + +Plain Zarr on disk is a directory of independent chunk files — there is no transaction boundary. If an ingest is interrupted mid-write, some chunks for a new time step may be written and others not, leaving the store in an inconsistent state with no way to roll back. + +The Climate API ingests one period at a time, committing each as a separate Icechunk snapshot. This gives three concrete properties: + +- **Safe resume** — if a job is cancelled or the server restarts, the next run reads the time coordinates already committed to the store and skips periods that are already present. No partial writes are ever visible to readers. +- **Snapshot isolation** — a read session opened at the start of a request sees a consistent snapshot even if a concurrent ingest is writing new periods. Readers are never blocked by writers. +- **Prunable history** — intermediate per-period snapshots accumulate during ingest. After the rechunk and pyramid passes complete, `expire_snapshots()` prunes all but the latest, keeping disk usage proportional to data size rather than ingest history. + +### Snapshot lifecycle + +A typical WorldPop ingest produces snapshots roughly like this: + +``` +snapshot 1: write period 2015 +snapshot 2: write period 2016 +... 
+snapshot 16: write period 2030 +snapshot 17: rechunk: time=1 +snapshot 18: pyramid: 6 levels +→ expire_snapshots() prunes snapshots 1–17 +snapshot 18: (the only surviving snapshot — full pyramid, correctly chunked) +``` + +### Serving from Icechunk + +Zarr keys are read directly from the Icechunk session store rather than from files on disk. The HTTP surface is identical — the same `/zarr/{dataset_id}/` routes — but the backend resolves each key through the Icechunk MVCC layer instead of a `FileResponse`. --- @@ -128,7 +163,7 @@ STAC metadata also stores the WGS84 bounding box alongside the native bbox, so c ## How Zarr stores are served -The `/zarr/{dataset_id}/` endpoint serves individual files from the Zarr store directory using FastAPI's `FileResponse`. The ZarrLayer client issues one HTTP request per chunk file it needs. +The `/zarr/{dataset_id}/` endpoint serves Zarr keys from the Icechunk repository. The ZarrLayer client issues one HTTP request per key it needs: ``` GET /zarr/{dataset_id}/zarr.json → root metadata (JSON) @@ -136,9 +171,9 @@ GET /zarr/{dataset_id}/precip/c/0/0/0 → chunk at time=0, x=0, y=0 GET /zarr/{dataset_id}/time/c/0 → time coordinate chunk ``` -Metadata files (`zarr.json`) are returned as `application/json`. All other files — chunk data — are returned as `application/octet-stream`. Directory paths return a JSON listing of their contents. +Each request opens a readonly Icechunk session pinned to the latest committed snapshot, resolves the zarr key through the MVCC layer, and returns the raw bytes. Metadata files (`zarr.json`) are returned as `application/json`; chunk data as `application/octet-stream`; directory paths as a JSON listing. -This design means the zarr store is served by ordinary file serving — there is no zarr-specific server middleware. 
+The HTTP surface is identical to serving files from disk — ZarrLayer and other zarr clients require no changes — but correctness and consistency are guaranteed by Icechunk's snapshot model rather than filesystem state. --- From 9a611598a6bd6c8358a3e3813cb469b172d661d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 12:49:54 +0200 Subject: [PATCH 43/80] fix: handle bare group paths in Icechunk zarr serving; fix legacy remote_zarr format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - _serve_icechunk_key: detect bare group-path requests (e.g. "0") that lack a "." or "/c/" marker and return an HTML directory listing so fsspec HTTP _ls_real can parse children — enables xr.open_zarr(group="0") over HTTP for Icechunk pyramid stores - _upgrade_legacy_record: map legacy "remote_zarr" format → "zarr" so old artifact records load without validation errors - examples: add .compute() before .item() on dask array results; fix snapshot variable shadowing in zarr_direct_access Co-Authored-By: Claude Sonnet 4.6 --- climate_api/client.py | 5 +-- climate_api/ingestions/services.py | 25 +++++++++++++- climate_api/stac/services.py | 25 ++++++++++++-- examples/stac_discover_and_open.py | 35 ++++++++++++++----- examples/zarr_direct_access.py | 54 +++++++++++++++++++++--------- 5 files changed, 116 insertions(+), 28 deletions(-) diff --git a/climate_api/client.py b/climate_api/client.py index 7b820e41..aa07c540 100644 --- a/climate_api/client.py +++ b/climate_api/client.py @@ -71,8 +71,9 @@ def open(self, dataset_id: str) -> xr.Dataset: """Open a published dataset as an xarray Dataset. Fetches the STAC collection for ``dataset_id``, reads the Zarr asset - metadata, and returns the opened dataset. Coordinates are always - ``time``, ``latitude``, and ``longitude``. + metadata, and returns the opened dataset. Spatial coordinate names + reflect the plugin's native convention (e.g. 
``x``/``y`` for raster + sources, ``longitude``/``latitude`` for ERA5-style sources). """ response = self._http.get(f"{self.base_url}/stac/collections/{dataset_id}") response.raise_for_status() diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 8e055df6..86d66b62 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -16,7 +16,7 @@ import pyproj import xarray as xr from fastapi import HTTPException -from fastapi.responses import FileResponse, JSONResponse +from fastapi.responses import FileResponse, HTMLResponse, JSONResponse from starlette.responses import Response from climate_api import config as api_config @@ -738,6 +738,27 @@ def _serve_icechunk_key(dataset_id: str, artifact: ArtifactRecord, relative_path "entries": entries, } + # Detect bare group-path requests (e.g. "0", "0/precip"). + # fsspec HTTP _ls_real issues GET without a trailing slash; zarr chunk/metadata + # keys always contain a "." (zarr.json) or "/c/" (chunk coordinates), so anything + # that matches neither pattern must be a group path. Return an HTML directory + # listing so fsspec can parse the children via links. + last_segment = key.rsplit("/", 1)[-1] + is_chunk_key = "/c/" in key or key.startswith("c/") + is_file_key = "." 
in last_segment + if not is_chunk_key and not is_file_key: + root: zarr.Group = zarr.open_group(session.store, mode="r") + try: + node: zarr.Group = root[key] # type: ignore[assignment] + except KeyError: + raise HTTPException(status_code=404, detail=f"Zarr path '{relative_path}' not found") + children = sorted(node.keys()) + html_lines = ["<html><body>"] + for child in children: + html_lines.append(f'<a href="{child}/">{child}/</a>') + html_lines.append("</body></html>") + return HTMLResponse("\n".join(html_lines)) + try: import zarr.core.buffer @@ -1145,6 +1166,8 @@ def _as_optional_str(value: object) -> str | None: def _upgrade_legacy_record(item: dict[str, object]) -> dict[str, object]: """Backfill newer schema fields for records created before migrations existed.""" + if item.get("format") == "remote_zarr": + item = {**item, "format": "zarr"} if "request_scope" not in item: coverage = item.get("coverage") if isinstance(coverage, dict): diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index d0ced39e..bd38ea72 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -446,8 +446,29 @@ def _zarr_asset_metadata(artifact: ArtifactRecord) -> dict[str, object]: return metadata -def _zarr_open_kwargs(artifact: ArtifactRecord) -> dict[str, bool | None]: - return {"consolidated": _zarr_consolidated_flag(_artifact_store_path(artifact))} +def _zarr_open_kwargs(artifact: ArtifactRecord) -> dict[str, object]: + artifact_path = _artifact_store_path(artifact) + if artifact.format == ArtifactFormat.ICECHUNK: + # Icechunk stores served over HTTP must use consolidated=False so that + # xarray reads zarr.json metadata directly rather than attempting HTTP + # directory listings (which our endpoint doesn't support). + # Pyramid stores also need group="0" — the root URL is exposed for + # zarr-layer zoom selection but data variables live under group "0".
+ kwargs: dict[str, object] = {"consolidated": False} + try: + import zarr + + from climate_api.ingest.store import open_or_create_repo + + repo = open_or_create_repo(Path(artifact_path)) + session = repo.readonly_session("main") + root = zarr.open_group(session.store, mode="r") + if "multiscales" in root.attrs: + kwargs["group"] = "0" + except Exception: + pass + return kwargs + return {"consolidated": _zarr_consolidated_flag(artifact_path)} def _build_renders(artifact: ArtifactRecord, source_dataset: dict[str, Any]) -> dict[str, Any] | None: diff --git a/examples/stac_discover_and_open.py b/examples/stac_discover_and_open.py index 3e845274..23f8e274 100644 --- a/examples/stac_discover_and_open.py +++ b/examples/stac_discover_and_open.py @@ -6,11 +6,21 @@ import json +import xarray as xr + from climate_api.client import Client BASE_URL = "http://127.0.0.1:8000" +def _spatial_dims(ds: xr.Dataset) -> tuple[str | None, str | None]: + """Return (y_dim, x_dim) from the dataset's dimension names.""" + dims = [str(d) for d in ds.dims] + y_dim = next((d for d in dims if d.lower() in ("y", "latitude", "lat")), None) + x_dim = next((d for d in dims if d.lower() in ("x", "longitude", "lon")), None) + return y_dim, x_dim + + def main() -> None: """Discover and open the first published dataset.""" api = Client(BASE_URL) @@ -28,16 +38,25 @@ def main() -> None: ds = api.open(first["id"]) print(ds) - print(f"\nTime range: {ds.time.values[0]} → {ds.time.values[-1]}") - print(f"Time steps: {ds.sizes['time']}") - print(f"Latitude: {ds.latitude.min().item()} → {ds.latitude.max().item()}") - print(f"Longitude: {ds.longitude.min().item()} → {ds.longitude.max().item()}") + y_dim, x_dim = _spatial_dims(ds) + + if "time" in ds.dims: + print(f"\nTime range: {ds.time.values[0]} → {ds.time.values[-1]}") + print(f"Time steps: {ds.sizes['time']}") + if y_dim: + print(f"{y_dim}: {ds[y_dim].min().item():.4f} → {ds[y_dim].max().item():.4f}") + if x_dim: + print(f"{x_dim}: 
{ds[x_dim].min().item():.4f} → {ds[x_dim].max().item():.4f}") variable = list(ds.data_vars)[0] - centre_lat = ds.latitude.mean().item() - centre_lon = ds.longitude.mean().item() - sample = ds[variable].isel(time=0).sel(latitude=centre_lat, longitude=centre_lon, method="nearest") - print(f"\n{variable} at domain centre, t=0: {sample.item()}") + if y_dim and x_dim: + centre = { + y_dim: ds[y_dim].mean().item(), + x_dim: ds[x_dim].mean().item(), + } + selector = {"time": 0} if "time" in ds.dims else {} + sample = ds[variable].isel(**selector).sel(**centre, method="nearest").compute() + print(f"\n{variable} at domain centre, {'t=0' if selector else 'static'}: {sample.item()}") if __name__ == "__main__": diff --git a/examples/zarr_direct_access.py b/examples/zarr_direct_access.py index 5cc722df..58edb175 100644 --- a/examples/zarr_direct_access.py +++ b/examples/zarr_direct_access.py @@ -4,11 +4,21 @@ Adjust BASE_URL if the API is not running on the default local address. """ +import xarray as xr + from climate_api.client import Client BASE_URL = "http://127.0.0.1:8000" +def _spatial_dims(ds: xr.Dataset) -> tuple[str | None, str | None]: + """Return (y_dim, x_dim) from the dataset's dimension names.""" + dims = [str(d) for d in ds.dims] + y_dim = next((d for d in dims if d.lower() in ("y", "latitude", "lat")), None) + x_dim = next((d for d in dims if d.lower() in ("x", "longitude", "lon")), None) + return y_dim, x_dim + + def main() -> None: """Open a Zarr store directly and demonstrate spatial and temporal subsetting.""" api = Client(BASE_URL) @@ -23,30 +33,44 @@ def main() -> None: ds = api.open(first["id"]) print(ds) + y_dim, x_dim = _spatial_dims(ds) + print(f"\nDimensions: {dict(ds.sizes)}") - print(f"Time range: {ds.time.values[0]} → {ds.time.values[-1]}") - print(f"Latitude: {ds.latitude.min().item()} → {ds.latitude.max().item()}") - print(f"Longitude: {ds.longitude.min().item()} → {ds.longitude.max().item()}") + if "time" in ds.dims: + print(f"Time range: 
{ds.time.values[0]} → {ds.time.values[-1]}") + if y_dim: + print(f"{y_dim}: {ds[y_dim].min().item():.4f} → {ds[y_dim].max().item():.4f}") + if x_dim: + print(f"{x_dim}: {ds[x_dim].min().item():.4f} → {ds[x_dim].max().item():.4f}") variable = list(ds.data_vars)[0] # Select a single time step - t0 = ds.time.values[0] - snapshot = ds[variable].sel(time=t0) - print(f"\n{variable} snapshot at {t0}:") - print(f" shape: {snapshot.shape}, min: {snapshot.min().item()}, max: {snapshot.max().item()}") + if "time" in ds.dims: + t0 = ds.time.values[0] + snapshot = ds[variable].sel(time=t0) + print(f"\n{variable} snapshot at {t0}:") + snap = snapshot.compute() + print(f" shape: {snap.shape}, min: {snap.min().item():.4f}, max: {snap.max().item():.4f}") # Select the point closest to the spatial centre of the domain - centre_lat = ds.latitude.mean().item() - centre_lon = ds.longitude.mean().item() - point = ds[variable].sel(latitude=centre_lat, longitude=centre_lon, method="nearest") - print(f"\n{variable} at domain centre ({centre_lat:.2f}, {centre_lon:.2f}):") - print(point.to_dataframe()[[variable]].head(10)) + if y_dim and x_dim: + centre = { + y_dim: ds[y_dim].mean().item(), + x_dim: ds[x_dim].mean().item(), + } + point = ds[variable].sel(**centre, method="nearest") + print(f"\n{variable} at domain centre ({centre[y_dim]:.2f}, {centre[x_dim]:.2f}):") + if "time" in ds.dims: + print(point.to_dataframe()[[variable]].head(10)) + else: + print(f" value: {point.compute().item()}") # Spatial mean over the full domain — first 10 time steps - spatial_mean = ds[variable].isel(time=slice(10)).mean(dim=["latitude", "longitude"]) - print(f"\nSpatial mean {variable} time series (first 10 steps):") - print(spatial_mean.to_dataframe()[[variable]]) + if "time" in ds.dims and y_dim and x_dim: + spatial_mean = ds[variable].isel(time=slice(10)).mean(dim=[y_dim, x_dim]) + print(f"\nSpatial mean {variable} time series (first 10 steps):") + print(spatial_mean.to_dataframe()[[variable]]) if __name__ 
== "__main__": From 5e76d6c6217cea9fd03af8a4b1bcc6d3ce79c715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 12:51:08 +0200 Subject: [PATCH 44/80] =?UTF-8?q?refactor:=20simplify=20examples=20?= =?UTF-8?q?=E2=80=94=20spatial=20dims=20are=20always=20x/y?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the _spatial_dims helper from both examples and access ds.x / ds.y directly. All stored datasets normalise their dimensions to (time, x, y) at write time, so the dynamic lookup was dead weight. Also update client.Client.open docstring to state the x/y guarantee. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/client.py | 5 ++-- examples/stac_discover_and_open.py | 30 +++++---------------- examples/zarr_direct_access.py | 43 +++++++++--------------------- 3 files changed, 20 insertions(+), 58 deletions(-) diff --git a/climate_api/client.py b/climate_api/client.py index aa07c540..95636ce6 100644 --- a/climate_api/client.py +++ b/climate_api/client.py @@ -71,9 +71,8 @@ def open(self, dataset_id: str) -> xr.Dataset: """Open a published dataset as an xarray Dataset. Fetches the STAC collection for ``dataset_id``, reads the Zarr asset - metadata, and returns the opened dataset. Spatial coordinate names - reflect the plugin's native convention (e.g. ``x``/``y`` for raster - sources, ``longitude``/``latitude`` for ERA5-style sources). + metadata, and returns the opened dataset. Spatial dimensions are always + named ``x`` and ``y``; the time dimension, when present, is ``time``.
""" response = self._http.get(f"{self.base_url}/stac/collections/{dataset_id}") response.raise_for_status() diff --git a/examples/stac_discover_and_open.py b/examples/stac_discover_and_open.py index 23f8e274..88fffaff 100644 --- a/examples/stac_discover_and_open.py +++ b/examples/stac_discover_and_open.py @@ -6,21 +6,11 @@ import json -import xarray as xr - from climate_api.client import Client BASE_URL = "http://127.0.0.1:8000" -def _spatial_dims(ds: xr.Dataset) -> tuple[str | None, str | None]: - """Return (y_dim, x_dim) from the dataset's dimension names.""" - dims = [str(d) for d in ds.dims] - y_dim = next((d for d in dims if d.lower() in ("y", "latitude", "lat")), None) - x_dim = next((d for d in dims if d.lower() in ("x", "longitude", "lon")), None) - return y_dim, x_dim - - def main() -> None: """Discover and open the first published dataset.""" api = Client(BASE_URL) @@ -38,25 +28,17 @@ def main() -> None: ds = api.open(first["id"]) print(ds) - y_dim, x_dim = _spatial_dims(ds) - if "time" in ds.dims: print(f"\nTime range: {ds.time.values[0]} → {ds.time.values[-1]}") print(f"Time steps: {ds.sizes['time']}") - if y_dim: - print(f"{y_dim}: {ds[y_dim].min().item():.4f} → {ds[y_dim].max().item():.4f}") - if x_dim: - print(f"{x_dim}: {ds[x_dim].min().item():.4f} → {ds[x_dim].max().item():.4f}") + print(f"y: {ds.y.min().item():.4f} → {ds.y.max().item():.4f}") + print(f"x: {ds.x.min().item():.4f} → {ds.x.max().item():.4f}") variable = list(ds.data_vars)[0] - if y_dim and x_dim: - centre = { - y_dim: ds[y_dim].mean().item(), - x_dim: ds[x_dim].mean().item(), - } - selector = {"time": 0} if "time" in ds.dims else {} - sample = ds[variable].isel(**selector).sel(**centre, method="nearest").compute() - print(f"\n{variable} at domain centre, {'t=0' if selector else 'static'}: {sample.item()}") + centre = {"y": ds.y.mean().item(), "x": ds.x.mean().item()} + selector = {"time": 0} if "time" in ds.dims else {} + sample = ds[variable].isel(**selector).sel(**centre, 
method="nearest").compute() + print(f"\n{variable} at domain centre, {'t=0' if selector else 'static'}: {sample.item()}") if __name__ == "__main__": diff --git a/examples/zarr_direct_access.py b/examples/zarr_direct_access.py index 58edb175..d2129d71 100644 --- a/examples/zarr_direct_access.py +++ b/examples/zarr_direct_access.py @@ -4,21 +4,11 @@ Adjust BASE_URL if the API is not running on the default local address. """ -import xarray as xr - from climate_api.client import Client BASE_URL = "http://127.0.0.1:8000" -def _spatial_dims(ds: xr.Dataset) -> tuple[str | None, str | None]: - """Return (y_dim, x_dim) from the dataset's dimension names.""" - dims = [str(d) for d in ds.dims] - y_dim = next((d for d in dims if d.lower() in ("y", "latitude", "lat")), None) - x_dim = next((d for d in dims if d.lower() in ("x", "longitude", "lon")), None) - return y_dim, x_dim - - def main() -> None: """Open a Zarr store directly and demonstrate spatial and temporal subsetting.""" api = Client(BASE_URL) @@ -33,42 +23,33 @@ def main() -> None: ds = api.open(first["id"]) print(ds) - y_dim, x_dim = _spatial_dims(ds) - print(f"\nDimensions: {dict(ds.sizes)}") if "time" in ds.dims: print(f"Time range: {ds.time.values[0]} → {ds.time.values[-1]}") - if y_dim: - print(f"{y_dim}: {ds[y_dim].min().item():.4f} → {ds[y_dim].max().item():.4f}") - if x_dim: - print(f"{x_dim}: {ds[x_dim].min().item():.4f} → {ds[x_dim].max().item():.4f}") + print(f"y: {ds.y.min().item():.4f} → {ds.y.max().item():.4f}") + print(f"x: {ds.x.min().item():.4f} → {ds.x.max().item():.4f}") variable = list(ds.data_vars)[0] # Select a single time step if "time" in ds.dims: t0 = ds.time.values[0] - snapshot = ds[variable].sel(time=t0) + snap = ds[variable].sel(time=t0).compute() print(f"\n{variable} snapshot at {t0}:") - snap = snapshot.compute() print(f" shape: {snap.shape}, min: {snap.min().item():.4f}, max: {snap.max().item():.4f}") # Select the point closest to the spatial centre of the domain - if y_dim and x_dim: 
- centre = { - y_dim: ds[y_dim].mean().item(), - x_dim: ds[x_dim].mean().item(), - } - point = ds[variable].sel(**centre, method="nearest") - print(f"\n{variable} at domain centre ({centre[y_dim]:.2f}, {centre[x_dim]:.2f}):") - if "time" in ds.dims: - print(point.to_dataframe()[[variable]].head(10)) - else: - print(f" value: {point.compute().item()}") + cy, cx = ds.y.mean().item(), ds.x.mean().item() + point = ds[variable].sel(y=cy, x=cx, method="nearest") + print(f"\n{variable} at domain centre ({cy:.2f}, {cx:.2f}):") + if "time" in ds.dims: + print(point.to_dataframe()[[variable]].head(10)) + else: + print(f" value: {point.compute().item()}") # Spatial mean over the full domain — first 10 time steps - if "time" in ds.dims and y_dim and x_dim: - spatial_mean = ds[variable].isel(time=slice(10)).mean(dim=[y_dim, x_dim]) + if "time" in ds.dims: + spatial_mean = ds[variable].isel(time=slice(10)).mean(dim=["y", "x"]) print(f"\nSpatial mean {variable} time series (first 10 steps):") print(spatial_mean.to_dataframe()[[variable]]) From f79c387ab085450f95cbabf6fda27342fd8147bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 12:53:33 +0200 Subject: [PATCH 45/80] refactor: remove _upgrade_legacy_record No backwards compatibility needed; drop the legacy schema-patching function entirely. The remote_zarr gefs_precipitation record was removed from the local data directory manually. 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingestions/services.py | 45 ++---------------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 86d66b62..23767f9f 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -780,7 +780,7 @@ def _serve_icechunk_key(dataset_id: str, artifact: ArtifactRecord, relative_path def _load_records() -> list[ArtifactRecord]: ensure_store() raw = json.loads(ARTIFACTS_INDEX_PATH.read_text(encoding="utf-8")) - return [ArtifactRecord.model_validate(_upgrade_legacy_record(item)) for item in raw] + return [ArtifactRecord.model_validate(item) for item in raw] def _save_records(records: list[ArtifactRecord]) -> None: @@ -885,7 +885,7 @@ def _mutate_records(mutation: Callable[[list[ArtifactRecord]], ArtifactRecord]) portalocker.lock(handle, portalocker.LOCK_EX) handle.seek(0) raw = handle.read() - records = [ArtifactRecord.model_validate(_upgrade_legacy_record(item)) for item in json.loads(raw or "[]")] + records = [ArtifactRecord.model_validate(item) for item in json.loads(raw or "[]")] result = mutation(records) payload = [record.model_dump(mode="json") for record in records] handle.seek(0) @@ -1162,44 +1162,3 @@ def _dataset_links(dataset_id: str, latest: ArtifactRecord) -> list[DatasetAcces def _as_optional_str(value: object) -> str | None: return value if isinstance(value, str) else None - - -def _upgrade_legacy_record(item: dict[str, object]) -> dict[str, object]: - """Backfill newer schema fields for records created before migrations existed.""" - if item.get("format") == "remote_zarr": - item = {**item, "format": "zarr"} - if "request_scope" not in item: - coverage = item.get("coverage") - if isinstance(coverage, dict): - spatial = coverage.get("spatial") - temporal = coverage.get("temporal") - bbox: tuple[float, float, float, float] | None = None - if isinstance(spatial, dict): - xmin = 
spatial.get("xmin") - ymin = spatial.get("ymin") - xmax = spatial.get("xmax") - ymax = spatial.get("ymax") - if ( - isinstance(xmin, int | float) - and isinstance(ymin, int | float) - and isinstance(xmax, int | float) - and isinstance(ymax, int | float) - ): - bbox = (float(xmin), float(ymin), float(xmax), float(ymax)) - - start = "" - end: str | None = None - if isinstance(temporal, dict): - raw_start = temporal.get("start") - raw_end = temporal.get("end") - if isinstance(raw_start, str): - start = raw_start - if isinstance(raw_end, str): - end = raw_end - - item["request_scope"] = { - "start": start, - "end": end, - "bbox": bbox, - } - return item From 9ce61f0fa619ce11e5205ada3cfd37d9bf16c6a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 13:09:15 +0200 Subject: [PATCH 46/80] =?UTF-8?q?refactor:=20remove=20legacy=20code=20?= =?UTF-8?q?=E2=80=94=20prefer=5Fzarr,=20NETCDF=20format,=20artifact.path,?= =?UTF-8?q?=20dead=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - client.py: remove module-level list_datasets() / open_dataset() duplicates - schemas.py: remove ArtifactFormat.NETCDF, ArtifactRecord.path, and prefer_zarr from CreateIngestionRequest and SyncDatasetRequest - routes.py (ingestions): remove /datasets/{id}/download NetCDF endpoint - services.py (ingestions): remove prefer_zarr param throughout, simplify _artifact_storage_exists / _find_existing_artifact_in_records / _upsert_icechunk_artifact_record to use asset_paths only - sync_engine.py: remove prefer_zarr param, replace latest_artifact.path with asset_paths[0] in pyramid append detection - system/routes.py, processing/resample.py, execution.py: remove prefer_zarr kwarg - publications/services.py: replace path-or-asset_paths[0] fallback with asset_paths[0] - stac/services.py: remove _clear_xstac_collection_cache, simplify _artifact_store_path to use asset_paths directly - tests: update all call sites; remove 
NETCDF-specific tests; replace fixture call to removed _clear_xstac_collection_cache Co-Authored-By: Claude Sonnet 4.6 --- climate_api/client.py | 58 ----------------- climate_api/ingestions/execution.py | 1 - climate_api/ingestions/routes.py | 19 +----- climate_api/ingestions/schemas.py | 7 --- climate_api/ingestions/services.py | 46 ++++---------- climate_api/ingestions/sync_engine.py | 26 ++++---- climate_api/processing/resample.py | 4 +- climate_api/publications/services.py | 8 +-- climate_api/stac/services.py | 6 -- climate_api/system/routes.py | 3 +- tests/test_client.py | 91 +-------------------------- tests/test_datasets.py | 24 ++----- tests/test_datasets_sync.py | 20 +++--- tests/test_processing_resample.py | 51 +++------------ tests/test_stac.py | 5 +- 15 files changed, 55 insertions(+), 314 deletions(-) diff --git a/climate_api/client.py b/climate_api/client.py index 95636ce6..08318104 100644 --- a/climate_api/client.py +++ b/climate_api/client.py @@ -1,19 +1,13 @@ """Lightweight client for discovering and opening published Climate API datasets.""" -import os from urllib.parse import urlparse import httpx import xarray as xr -_FALLBACK_BASE_URL = "http://127.0.0.1:8000" _DEFAULT_TIMEOUT = 30.0 -def _default_base_url() -> str: - return os.environ.get("CLIMATE_API_BASE_URL", _FALLBACK_BASE_URL) - - def _id_from_href(href: str) -> str: """Extract the dataset id from a STAC child href by reading the last URL path segment.""" return urlparse(href).path.rstrip("/").rsplit("/", 1)[-1] @@ -92,55 +86,3 @@ def open(self, dataset_id: str) -> xr.Dataset: if not isinstance(open_kwargs, dict): raise ValueError(f"Zarr asset for '{dataset_id}' has a malformed xarray:open_kwargs field") return xr.open_zarr(href, **open_kwargs) # type: ignore[no-any-return] - - -def list_datasets(base_url: str | None = None) -> list[dict]: - """Return all published datasets from the STAC catalog. - - Each entry is a STAC child link dict with at least ``id``, ``title``, and ``href``. 
- ``base_url`` defaults to the ``CLIMATE_API_BASE_URL`` environment variable, - falling back to ``http://127.0.0.1:8000``. - """ - url = (base_url or _default_base_url()).rstrip("/") - response = httpx.get(f"{url}/stac/catalog.json", timeout=_DEFAULT_TIMEOUT) - response.raise_for_status() - catalog = response.json() - raw_links = catalog.get("links") - if not isinstance(raw_links, list): - raise ValueError(f"Invalid STAC catalog response from {url}: missing or non-list 'links' field") - links = [] - for link in raw_links: - if isinstance(link, dict) and link.get("rel") == "child": - href = link.get("href") - if not isinstance(href, str) or not href: - raise ValueError(f"STAC child link from {url} has a missing or invalid href") - links.append({**link, "id": _id_from_href(href)}) - return links - - -def open_dataset(dataset_id: str, *, base_url: str | None = None) -> xr.Dataset: - """Open a published dataset as an xarray Dataset. - - Fetches the STAC collection for ``dataset_id``, reads the Zarr asset - metadata, and returns the opened dataset. Coordinates are always - ``time``, ``latitude``, and ``longitude``. - ``base_url`` defaults to the ``CLIMATE_API_BASE_URL`` environment variable, - falling back to ``http://127.0.0.1:8000``. 
- """ - url = (base_url or _default_base_url()).rstrip("/") - response = httpx.get(f"{url}/stac/collections/{dataset_id}", timeout=_DEFAULT_TIMEOUT) - response.raise_for_status() - collection = response.json() - assets = collection.get("assets") - if not isinstance(assets, dict): - raise ValueError(f"STAC collection for '{dataset_id}' from {url} has a missing or invalid 'assets' field") - asset = assets.get("zarr") - if not isinstance(asset, dict): - raise ValueError(f"Dataset '{dataset_id}' has no Zarr asset in the STAC collection") - href = asset.get("href") - if not isinstance(href, str) or not href: - raise ValueError(f"Zarr asset for '{dataset_id}' has a missing or invalid href") - open_kwargs = asset.get("xarray:open_kwargs", {}) - if not isinstance(open_kwargs, dict): - raise ValueError(f"Zarr asset for '{dataset_id}' has a malformed xarray:open_kwargs field") - return xr.open_zarr(href, **open_kwargs) # type: ignore[no-any-return] diff --git a/climate_api/ingestions/execution.py b/climate_api/ingestions/execution.py index b45011cb..2c3592be 100644 --- a/climate_api/ingestions/execution.py +++ b/climate_api/ingestions/execution.py @@ -39,7 +39,6 @@ def execute_ingest( end=end, bbox=resolved_bbox, overwrite=overwrite, - prefer_zarr=False, publish=publish, on_progress=on_progress, is_cancel_requested=is_cancel_requested, diff --git a/climate_api/ingestions/routes.py b/climate_api/ingestions/routes.py index 5567b3bb..8f2c52d8 100644 --- a/climate_api/ingestions/routes.py +++ b/climate_api/ingestions/routes.py @@ -3,7 +3,6 @@ from typing import Any from fastapi import APIRouter, Header, HTTPException -from fastapi.responses import FileResponse from starlette.responses import Response from climate_api.data_registry.routes import _get_dataset_or_404 @@ -71,7 +70,6 @@ def create_ingestion( end=request.end, bbox=resolved_bbox, overwrite=request.overwrite, - prefer_zarr=request.prefer_zarr, publish=request.publish, ) return IngestionResponse( @@ -105,20 +103,6 @@ def 
get_dataset(dataset_id: str) -> DatasetDetailRecord: return services.get_dataset_or_404(dataset_id) -@datasets_router.get("/{dataset_id}/download") -def download_artifact_file(dataset_id: str) -> FileResponse: - """Download the primary saved file for a dataset when available.""" - artifact = services.get_latest_artifact_for_dataset_or_404(dataset_id) - if artifact.path is None or artifact.format.value == "zarr": - raise HTTPException( - status_code=409, - detail="Dataset is not a single downloadable file; use metadata and dataset assets instead", - ) - - media_type = "application/x-netcdf" - filename = f"{dataset_id}.nc" - return FileResponse(artifact.path, media_type=media_type, filename=filename) - @zarr_router.api_route("/{dataset_id}", methods=["GET", "HEAD"]) def get_canonical_zarr_store_info(dataset_id: str) -> dict[str, object]: @@ -127,7 +111,7 @@ def get_canonical_zarr_store_info(dataset_id: str) -> dict[str, object]: @zarr_router.api_route("/{dataset_id}/{relative_path:path}", methods=["GET", "HEAD"], response_model=None) -def get_canonical_zarr_store_file(dataset_id: str, relative_path: str) -> FileResponse | Response | dict[str, object]: +def get_canonical_zarr_store_file(dataset_id: str, relative_path: str) -> Response | dict[str, object]: """Serve canonical Zarr store content for a managed dataset.""" return services.get_dataset_zarr_store_file_or_404(dataset_id, relative_path) @@ -138,7 +122,6 @@ def sync_dataset(dataset_id: str, request: SyncDatasetRequest) -> SyncResponse: return services.sync_dataset( dataset_id=dataset_id, end=request.end, - prefer_zarr=request.prefer_zarr, publish=request.publish, ) diff --git a/climate_api/ingestions/schemas.py b/climate_api/ingestions/schemas.py index 4f49253e..3c267993 100644 --- a/climate_api/ingestions/schemas.py +++ b/climate_api/ingestions/schemas.py @@ -10,7 +10,6 @@ class ArtifactFormat(StrEnum): """Supported stored artifact formats.""" ZARR = "zarr" - NETCDF = "netcdf" ICECHUNK = "icechunk" @@ -99,7 
+98,6 @@ class ArtifactRecord(BaseModel): variable: str period_type: str | None = None format: ArtifactFormat - path: str | None = None asset_paths: list[str] = Field(default_factory=list) variables: list[str] = Field(default_factory=list) request_scope: ArtifactRequestScope @@ -118,10 +116,6 @@ class CreateIngestionRequest(BaseModel): default=False, description="Whether to force regeneration of an existing matching artifact.", ) - prefer_zarr: bool = Field( - default=True, - description="Whether to prefer GeoZarr materialization when available.", - ) publish: bool = Field( default=True, description="Whether to publish the resulting dataset through pygeoapi.", @@ -273,7 +267,6 @@ class SyncDatasetRequest(BaseModel): """Request payload for syncing a managed dataset forward.""" end: str | None = Field(default=None, description="Optional end period to sync through.") - prefer_zarr: bool = Field(default=True, description="Whether to prefer GeoZarr materialization when syncing.") publish: bool = Field(default=True, description="Whether to publish the resulting dataset version.") diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 23767f9f..55b5114e 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -197,7 +197,6 @@ def create_artifact( end: str | None, bbox: list[float] | None, overwrite: bool, - prefer_zarr: bool, publish: bool, download_start: str | None = None, download_end: str | None = None, @@ -231,7 +230,6 @@ def create_artifact( existing = _find_existing_artifact( dataset_id=str(dataset["id"]), request_scope=request_scope, - prefer_zarr=prefer_zarr, ) if existing is not None and not overwrite: logger.info( @@ -479,7 +477,6 @@ def store_materialized_zarr_artifact( variable=str(dataset["variable"]), period_type=str(dataset.get("period_type")) if dataset.get("period_type") is not None else None, format=ArtifactFormat.ZARR, - path=str(zarr_path.resolve()), 
asset_paths=[str(zarr_path.resolve())], variables=[str(dataset["variable"])], request_scope=request_scope, @@ -487,7 +484,7 @@ def store_materialized_zarr_artifact( created_at=datetime.now(UTC), publication=ArtifactPublication(), ) - stored_record = _upsert_artifact_record(record, prefer_zarr=True, publish=publish, overwrite=overwrite) + stored_record = _upsert_artifact_record(record, publish=publish, overwrite=overwrite) if publish and stored_record.publication.status != PublicationStatus.PUBLISHED: return publish_artifact_record(stored_record.artifact_id) return stored_record @@ -497,7 +494,6 @@ def sync_dataset( *, dataset_id: str, end: str | None, - prefer_zarr: bool, publish: bool, ) -> SyncResponse: """Resolve sync inputs and delegate managed-dataset sync to the sync engine. @@ -515,11 +511,11 @@ def sync_dataset( if source_dataset is None: raise HTTPException(status_code=404, detail=f"Source dataset '{latest_artifact.dataset_id}' not found") committed_end: str | None = None - if latest_artifact.format == ArtifactFormat.ICECHUNK and latest_artifact.path: + if latest_artifact.format == ArtifactFormat.ICECHUNK and latest_artifact.asset_paths: from climate_api.ingest.store import read_committed_period_ids period_type = str(source_dataset.get("period_type", "")) - committed = read_committed_period_ids(Path(latest_artifact.path), period_type) + committed = read_committed_period_ids(Path(latest_artifact.asset_paths[0]), period_type) committed_end = max(committed) if committed else None logger.info( "Icechunk store-based current_end for '%s': %s (artifact record had: %s)", @@ -533,7 +529,6 @@ def sync_dataset( latest_artifact=latest_artifact, source_dataset=source_dataset, requested_end=end, - prefer_zarr=prefer_zarr, publish=publish, create_artifact_fn=create_artifact, get_dataset_fn=get_dataset_or_404, @@ -598,7 +593,7 @@ def _icechunk_store_info(dataset_id: str, artifact: ArtifactRecord) -> dict[str, from climate_api.ingest.store import open_or_create_repo - 
store_path = Path(artifact.path or artifact.asset_paths[0]) + store_path = Path(artifact.asset_paths[0]) if not store_path.exists(): raise HTTPException(status_code=404, detail="Icechunk store not found on disk") @@ -707,7 +702,7 @@ def _serve_icechunk_key(dataset_id: str, artifact: ArtifactRecord, relative_path from climate_api.ingest.store import open_or_create_repo - store_path = Path(artifact.path or artifact.asset_paths[0]) + store_path = Path(artifact.asset_paths[0]) if not store_path.exists(): raise HTTPException(status_code=404, detail="Icechunk store not found on disk") @@ -792,7 +787,6 @@ def _save_records(records: list[ArtifactRecord]) -> None: def _store_artifact_record( record: ArtifactRecord, *, - prefer_zarr: bool, publish: bool, ) -> ArtifactRecord: """Persist a newly created artifact record while avoiding lost updates.""" @@ -802,7 +796,6 @@ def mutate(records: list[ArtifactRecord]) -> ArtifactRecord: records=records, dataset_id=record.dataset_id, request_scope=record.request_scope, - prefer_zarr=prefer_zarr, ) if existing is not None: if publish and existing.publication.status != PublicationStatus.PUBLISHED: @@ -825,7 +818,7 @@ def _upsert_icechunk_artifact_record(record: ArtifactRecord) -> ArtifactRecord: def mutate(records: list[ArtifactRecord]) -> ArtifactRecord: for i, existing in enumerate(records): - if existing.dataset_id == record.dataset_id and existing.path == record.path: + if existing.dataset_id == record.dataset_id and existing.asset_paths == record.asset_paths: replacement = record.model_copy( update={ "artifact_id": existing.artifact_id, @@ -843,20 +836,18 @@ def mutate(records: list[ArtifactRecord]) -> ArtifactRecord: def _upsert_artifact_record( record: ArtifactRecord, *, - prefer_zarr: bool, publish: bool, overwrite: bool, ) -> ArtifactRecord: """Persist a new or replacement artifact record for the same logical request scope.""" if not overwrite: - return _store_artifact_record(record, prefer_zarr=prefer_zarr, publish=publish) + 
return _store_artifact_record(record, publish=publish) def mutate(records: list[ArtifactRecord]) -> ArtifactRecord: existing = _find_existing_artifact_in_records( records=records, dataset_id=record.dataset_id, request_scope=record.request_scope, - prefer_zarr=prefer_zarr, ) if existing is None: records.append(record) @@ -902,7 +893,7 @@ def _get_zarr_root_or_409(artifact: ArtifactRecord) -> Path: if artifact.format != ArtifactFormat.ZARR: raise HTTPException(status_code=409, detail="Artifact is not a Zarr store") - store_root = Path(artifact.path or artifact.asset_paths[0]).resolve() + store_root = Path(artifact.asset_paths[0]).resolve() if not store_root.exists() or not store_root.is_dir(): raise HTTPException(status_code=404, detail="Zarr store path does not exist on disk") return store_root @@ -946,14 +937,12 @@ def _find_existing_artifact( *, dataset_id: str, request_scope: ArtifactRequestScope, - prefer_zarr: bool, ) -> ArtifactRecord | None: """Return an existing artifact for an identical logical request when possible.""" return _find_existing_artifact_in_records( records=_load_records(), dataset_id=dataset_id, request_scope=request_scope, - prefer_zarr=prefer_zarr, ) @@ -1019,7 +1008,6 @@ def _find_existing_artifact_in_records( records: list[ArtifactRecord], dataset_id: str, request_scope: ArtifactRequestScope, - prefer_zarr: bool, ) -> ArtifactRecord | None: """Return an existing artifact for an identical logical request from a provided record set.""" for record in reversed(records): @@ -1043,8 +1031,6 @@ def _find_existing_artifact_in_records( record.request_scope.end, ) continue - if prefer_zarr and record.format not in (ArtifactFormat.ZARR, ArtifactFormat.ICECHUNK): - continue return record return None @@ -1067,14 +1053,9 @@ def _materialized_records(records: list[ArtifactRecord]) -> list[ArtifactRecord] def _artifact_storage_exists(record: ArtifactRecord) -> bool: """Return whether an artifact's on-disk backing files are still present.""" - paths: 
list[str] = [] - if record.path is not None: - paths.append(record.path) - if record.asset_paths: - paths.extend(record.asset_paths) - if not paths: + if not record.asset_paths: return False - return all(Path(path).exists() for path in paths) + return all(Path(path).exists() for path in record.asset_paths) def _temporal_coverage_matches_request_scope( @@ -1146,13 +1127,8 @@ def _dataset_links(dataset_id: str, latest: ArtifactRecord) -> list[DatasetAcces DatasetAccessLink(href=f"/datasets/{dataset_id}", rel="self", title="Dataset detail"), DatasetAccessLink(href=f"/zarr/{dataset_id}", rel="zarr", title="Zarr store"), ] - zarr_formats = {ArtifactFormat.ZARR, ArtifactFormat.ICECHUNK} - if latest.publication.status == PublicationStatus.PUBLISHED and latest.format in zarr_formats: + if latest.publication.status == PublicationStatus.PUBLISHED and latest.format in {ArtifactFormat.ZARR, ArtifactFormat.ICECHUNK}: links.append(DatasetAccessLink(href=f"/stac/collections/{dataset_id}", rel="stac", title="STAC collection")) - if latest.format == ArtifactFormat.NETCDF: - links.append( - DatasetAccessLink(href=f"/datasets/{dataset_id}/download", rel="download", title="Download NetCDF") - ) if latest.publication.pygeoapi_path is not None: links.append( DatasetAccessLink(href=latest.publication.pygeoapi_path, rel="ogc-collection", title="OGC collection") diff --git a/climate_api/ingestions/sync_engine.py b/climate_api/ingestions/sync_engine.py index 765ab5fc..6e9d7a7b 100644 --- a/climate_api/ingestions/sync_engine.py +++ b/climate_api/ingestions/sync_engine.py @@ -168,7 +168,6 @@ def run_sync( latest_artifact: ArtifactRecord, source_dataset: dict[str, Any], requested_end: str | None, - prefer_zarr: bool, publish: bool, create_artifact_fn: Callable[..., ArtifactRecord], get_dataset_fn: Callable[[str], Any], @@ -249,7 +248,6 @@ def run_sync( download_end=sync_detail.delta_end if download_start is not None else None, bbox=list(latest_artifact.request_scope.bbox) if 
latest_artifact.request_scope.bbox is not None else None, overwrite=False, - prefer_zarr=prefer_zarr, publish=publish, ) logger.info( @@ -421,8 +419,7 @@ def _supports_append(source_dataset: dict[str, Any], latest_artifact: ArtifactRe # Pyramid Icechunk stores have data under group "0"; appending to root # would create a second flat dataset instead of extending the pyramid. # Fall back to rematerialize so the full pyramid is rebuilt. - artifact_path = latest_artifact.path - if artifact_path: + if latest_artifact.asset_paths: from pathlib import Path from climate_api.ingest.store import open_or_create_repo @@ -430,7 +427,7 @@ def _supports_append(source_dataset: dict[str, Any], latest_artifact: ArtifactRe try: import zarr - repo = open_or_create_repo(Path(artifact_path)) + repo = open_or_create_repo(Path(latest_artifact.asset_paths[0])) session = repo.readonly_session("main") root = zarr.open_group(session.store, mode="r") if "multiscales" in root.attrs: @@ -447,13 +444,14 @@ def _supports_append(source_dataset: dict[str, Any], latest_artifact: ArtifactRe return False # Pyramid zarr stores cannot be appended to — they must be rebuilt in full. # Detect this from the existing artifact's on-disk structure rather than YAML. 
- from pathlib import Path - - artifact_path = latest_artifact.path - if artifact_path and "://" not in artifact_path and (Path(artifact_path) / "0").is_dir(): - logger.warning( - "Sync append execution is not supported for pyramid zarr dataset '%s'; falling back to rematerialize", - source_dataset.get("id", ""), - ) - return False + if latest_artifact.asset_paths: + from pathlib import Path + + artifact_path = latest_artifact.asset_paths[0] + if "://" not in artifact_path and (Path(artifact_path) / "0").is_dir(): + logger.warning( + "Sync append execution is not supported for pyramid zarr dataset '%s'; falling back to rematerialize", + source_dataset.get("id", ""), + ) + return False return True diff --git a/climate_api/processing/resample.py b/climate_api/processing/resample.py index ed3bc814..bbca9467 100644 --- a/climate_api/processing/resample.py +++ b/climate_api/processing/resample.py @@ -76,7 +76,6 @@ def materialize_resampled_artifact( existing = ingestion_services._find_existing_artifact( dataset_id=target_dataset_id, request_scope=ArtifactRequestScope(start=start, end=resolved_end), - prefer_zarr=True, ) if existing is not None and not overwrite: if publish and existing.publication.status != PublicationStatus.PUBLISHED: @@ -94,7 +93,7 @@ def materialize_resampled_artifact( target_managed_dataset_id = managed_dataset_id_for_scope(target_dataset_id) zarr_path = DERIVED_DATA_DIR / f"{target_managed_dataset_id}.zarr" - source_path = source_artifact.path or source_artifact.asset_paths[0] + source_path = source_artifact.asset_paths[0] if source_artifact.format == ArtifactFormat.ICECHUNK: source_ds = open_icechunk_dataset(source_path) else: @@ -238,7 +237,6 @@ def _find_existing_resampled_artifact( return ingestion_services._find_existing_artifact( dataset_id=target_dataset_id, request_scope=ArtifactRequestScope(start=start, end=realized_end), - prefer_zarr=True, ) diff --git a/climate_api/publications/services.py b/climate_api/publications/services.py index 
8e254de4..e7444af2 100644 --- a/climate_api/publications/services.py +++ b/climate_api/publications/services.py @@ -47,7 +47,7 @@ def publish_artifact(record: ArtifactRecord) -> ArtifactRecord: from climate_api.ingestions.services import list_artifacts collection_id = managed_dataset_id_for(record) - data_path = record.path or record.asset_paths[0] + data_path = record.asset_paths[0] is_pyramid_zarr = record.format == ArtifactFormat.ZARR and (Path(data_path) / "0").is_dir() is_icechunk = record.format == ArtifactFormat.ICECHUNK published_record = record.model_copy( @@ -71,7 +71,7 @@ def publish_artifact(record: ArtifactRecord) -> ArtifactRecord: active = published_record if artifact.artifact_id == record.artifact_id else artifact if active.publication.status != PublicationStatus.PUBLISHED: continue - data_path = active.path or active.asset_paths[0] + data_path = active.asset_paths[0] if active.format == ArtifactFormat.ICECHUNK: continue # icechunk: not served via pygeoapi, use /zarr endpoint instead if active.format == ArtifactFormat.ZARR and (Path(data_path) / "0").is_dir(): @@ -120,7 +120,7 @@ def _build_collection_resource(record: ArtifactRecord) -> dict[str, Any]: provider: dict[str, Any] = { "type": "coverage", "name": "xarray", - "data": record.path or record.asset_paths[0], + "data": record.asset_paths[0], "x_field": x_field, "y_field": y_field, "time_field": time_field, @@ -164,7 +164,7 @@ def _provider_format(artifact_format: ArtifactFormat) -> dict[str, str]: def _provider_axes(record: ArtifactRecord) -> tuple[str, str, str]: """Inspect an artifact and return provider axis field names.""" - data_path = record.path or record.asset_paths[0] + data_path = record.asset_paths[0] if record.format == ArtifactFormat.ICECHUNK: ds = open_icechunk_dataset(data_path) elif record.format == ArtifactFormat.ZARR: diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index bd38ea72..0b9429fb 100644 --- a/climate_api/stac/services.py +++ 
b/climate_api/stac/services.py @@ -261,10 +261,6 @@ def _cache_xstac_collection_payload(artifact_id: str, payload: dict[str, Any]) - _xstac_collection_cache[artifact_id] = deepcopy(payload) -def _clear_xstac_collection_cache() -> None: - _xstac_collection_cache.clear() - - def _link_to_dict(link: pystac.Link) -> dict[str, Any]: target = link.target href = target if isinstance(target, str) else link.href @@ -296,8 +292,6 @@ def _required_zarr_asset(template: pystac.Collection) -> pystac.Asset: def _artifact_store_path(artifact: ArtifactRecord) -> str: - if artifact.path: - return artifact.path if artifact.asset_paths: return artifact.asset_paths[0] raise HTTPException( diff --git a/climate_api/system/routes.py b/climate_api/system/routes.py index 6298dd65..a0fd5439 100644 --- a/climate_api/system/routes.py +++ b/climate_api/system/routes.py @@ -73,7 +73,6 @@ async def manage_ingest(request: Request) -> RedirectResponse: end=end, bbox=resolved_bbox, overwrite=overwrite, - prefer_zarr=True, publish=publish, ) name = urllib.parse.quote(template.get("name", dataset_id)) @@ -99,7 +98,7 @@ async def manage_sync(request: Request) -> RedirectResponse: dataset_id = str(form.get("dataset_id", "")) publish = "publish" in form - sync_dataset(dataset_id=dataset_id, end=None, prefer_zarr=True, publish=publish) + sync_dataset(dataset_id=dataset_id, end=None, publish=publish) return RedirectResponse(f"{base}/manage?message=Sync+completed", status_code=303) except HTTPException as exc: msg = urllib.parse.quote(str(exc.detail)) diff --git a/tests/test_client.py b/tests/test_client.py index 16e1a0b4..44509d36 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -1,13 +1,11 @@ from pathlib import Path from unittest.mock import MagicMock, patch -import httpx import numpy as np import pandas as pd -import pytest import xarray as xr -from climate_api.client import Client, _id_from_href, list_datasets, open_dataset +from climate_api.client import Client, _id_from_href def 
_make_catalog(hrefs: list[str]) -> dict: @@ -51,93 +49,6 @@ def test_id_from_href_strips_trailing_slash() -> None: assert _id_from_href("http://localhost/stac/collections/ds/") == "ds" -# ── module-level list_datasets ───────────────────────────────────────────────── - - -def test_list_datasets_returns_child_links() -> None: - catalog = _make_catalog(["http://localhost/stac/collections/chirps3_precipitation_daily_rwa"]) - with patch("climate_api.client.httpx.get", return_value=_make_response(catalog)) as mock_get: - result = list_datasets("http://localhost") - - mock_get.assert_called_once_with("http://localhost/stac/catalog.json", timeout=30) - assert len(result) == 1 - assert result[0]["rel"] == "child" - assert "chirps3" in result[0]["href"] - - -def test_list_datasets_returns_empty_for_no_children() -> None: - catalog = {"links": [{"rel": "root", "href": "http://localhost/stac/catalog.json"}]} - with patch("climate_api.client.httpx.get", return_value=_make_response(catalog)): - result = list_datasets("http://localhost") - - assert result == [] - - -def test_list_datasets_raises_on_http_error() -> None: - with patch("climate_api.client.httpx.get") as mock_get: - mock_get.return_value.raise_for_status.side_effect = httpx.HTTPStatusError( - "404", request=MagicMock(), response=MagicMock() - ) - with pytest.raises(httpx.HTTPStatusError): - list_datasets("http://localhost") - - -# ── module-level open_dataset ────────────────────────────────────────────────── - - -def test_open_dataset_fetches_collection_and_opens_zarr(tmp_path: Path) -> None: - zarr_path = tmp_path / "test.zarr" - ds = xr.Dataset( - {"precip": (["time", "latitude", "longitude"], np.ones((2, 3, 3), dtype="float32"))}, - coords={ - "time": pd.date_range("2024-01-01", periods=2, freq="D"), - "latitude": [3.0, 2.0, 1.0], - "longitude": [10.0, 11.0, 12.0], - }, - ) - ds.to_zarr(str(zarr_path), mode="w", consolidated=True) - - collection = _make_collection(str(zarr_path)) - with 
patch("climate_api.client.httpx.get", return_value=_make_response(collection)): - result = open_dataset("chirps3_precipitation_daily_rwa", base_url="http://localhost") - - try: - assert "precip" in result.data_vars - assert result.sizes["time"] == 2 - assert "latitude" in result.coords - assert "longitude" in result.coords - finally: - result.close() - - -def test_open_dataset_raises_on_http_error() -> None: - with patch("climate_api.client.httpx.get") as mock_get: - mock_get.return_value.raise_for_status.side_effect = httpx.HTTPStatusError( - "404", request=MagicMock(), response=MagicMock() - ) - with pytest.raises(httpx.HTTPStatusError): - open_dataset("nonexistent", base_url="http://localhost") - - -def test_open_dataset_uses_default_base_url() -> None: - collection = _make_collection("/dev/null") - with patch("climate_api.client.httpx.get", return_value=_make_response(collection)) as mock_get: - with patch("climate_api.client.xr.open_zarr", return_value=MagicMock()): - open_dataset("any_dataset") - - mock_get.assert_called_once_with("http://127.0.0.1:8000/stac/collections/any_dataset", timeout=30) - - -def test_open_dataset_uses_env_var_base_url(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv("CLIMATE_API_BASE_URL", "http://env-host:9000") - collection = _make_collection("/dev/null") - with patch("climate_api.client.httpx.get", return_value=_make_response(collection)) as mock_get: - with patch("climate_api.client.xr.open_zarr", return_value=MagicMock()): - open_dataset("any_dataset") - - mock_get.assert_called_once_with("http://env-host:9000/stac/collections/any_dataset", timeout=30) - - # ── Client class ─────────────────────────────────────────────────────────────── diff --git a/tests/test_datasets.py b/tests/test_datasets.py index f036bdc3..d7cee416 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -134,17 +134,13 @@ def test_dataset_links_include_stac_for_published_icechunk() -> None: assert any(link.rel == "stac" and 
link.href == "/stac/collections/chirps3_precipitation_daily" for link in links) -def test_dataset_links_omit_stac_for_unpublished_or_netcdf() -> None: +def test_dataset_links_omit_stac_for_unpublished() -> None: unpublished = _artifact(artifact_id="a1") unpublished.publication.status = PublicationStatus.UNPUBLISHED - netcdf = _artifact(artifact_id="a2") - netcdf.format = ArtifactFormat.NETCDF unpublished_links = services._dataset_links("chirps3_precipitation_daily", unpublished) - netcdf_links = services._dataset_links("chirps3_precipitation_daily", netcdf) assert all(link.rel != "stac" for link in unpublished_links) - assert all(link.rel != "stac" for link in netcdf_links) def test_list_ingestions_returns_most_recent_first(monkeypatch: pytest.MonkeyPatch) -> None: @@ -213,7 +209,7 @@ def test_find_existing_artifact_ignores_record_with_overwide_coverage() -> None: records=[stale_artifact, valid_artifact], dataset_id="chirps3_precipitation_daily", request_scope=request_scope, - prefer_zarr=True, + ) assert result == valid_artifact @@ -237,7 +233,7 @@ def test_find_existing_artifact_ignores_stale_record(monkeypatch: pytest.MonkeyP records=[stale_artifact, valid_artifact], dataset_id="chirps3_precipitation_daily", request_scope=request_scope, - prefer_zarr=True, + ) assert result == valid_artifact @@ -304,7 +300,7 @@ def test_create_artifact_rejects_partial_download_scope(monkeypatch: pytest.Monk download_end="2026-02-10", bbox=[1.0, 2.0, 3.0, 4.0], overwrite=False, - prefer_zarr=True, + publish=False, ) @@ -331,7 +327,7 @@ def test_create_artifact_rejects_download_scope_outside_request_scope(monkeypatc download_end="2026-02-11", bbox=[1.0, 2.0, 3.0, 4.0], overwrite=False, - prefer_zarr=True, + publish=False, ) @@ -424,13 +420,3 @@ def fake_serve_icechunk_key(dataset_id: str, art: ArtifactRecord, relative_path: assert served_keys == ["t2m/zarr.json"] -def test_get_zarr_store_info_raises_409_for_netcdf_artifact(monkeypatch: pytest.MonkeyPatch) -> None: - netcdf = 
_artifact(artifact_id="a1") - netcdf = netcdf.model_copy(update={"format": ArtifactFormat.NETCDF}) - - monkeypatch.setattr(services, "get_latest_artifact_for_dataset_or_404", lambda _: netcdf) - - with pytest.raises(services.HTTPException) as exc_info: - services.get_dataset_zarr_store_info_or_404("chirps3_precipitation_daily") - - assert exc_info.value.status_code == 409 diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index 2a3a3694..19019807 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -104,7 +104,7 @@ def test_sync_dataset_returns_up_to_date_when_no_new_period_is_due(monkeypatch: ) monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) - result = services.sync_dataset(dataset_id=dataset_id, end="2026-01-31", prefer_zarr=True, publish=True) + result = services.sync_dataset(dataset_id=dataset_id, end="2026-01-31", publish=True) assert result.sync_id is None assert result.status == "up_to_date" @@ -137,7 +137,7 @@ def fake_create_artifact(**kwargs: object) -> ArtifactRecord: monkeypatch.setattr(services, "create_artifact", fake_create_artifact) monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) - result = services.sync_dataset(dataset_id=dataset_id, end="2026-02-10", prefer_zarr=True, publish=True) + result = services.sync_dataset(dataset_id=dataset_id, end="2026-02-10", publish=True) assert captured["start"] == "2026-01-01" assert captured["end"] == "2026-02-10" @@ -186,7 +186,7 @@ def fake_create_artifact(**kwargs: object) -> ArtifactRecord: monkeypatch.setattr(services, "create_artifact", fake_create_artifact) monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) - result = services.sync_dataset(dataset_id=dataset_id, end="2026-02-10", prefer_zarr=True, publish=True) + result = services.sync_dataset(dataset_id=dataset_id, end="2026-02-10", publish=True) assert captured["start"] == "2026-01-01" 
assert captured["end"] == "2026-02-10" @@ -254,7 +254,7 @@ def fake_warning(message: str, *args: object) -> None: monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) monkeypatch.setattr(sync_engine.logger, "warning", fake_warning) - result = services.sync_dataset(dataset_id=dataset_id, end="2025", prefer_zarr=True, publish=True) + result = services.sync_dataset(dataset_id=dataset_id, end="2025", publish=True) assert "download_start" in captured assert captured["download_start"] is None @@ -278,7 +278,7 @@ def test_sync_dataset_release_policy_returns_up_to_date_when_release_matches(mon ) monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) - result = services.sync_dataset(dataset_id=dataset_id, end="2024", prefer_zarr=True, publish=True) + result = services.sync_dataset(dataset_id=dataset_id, end="2024", publish=True) assert result.sync_id is None assert result.status == "up_to_date" @@ -350,7 +350,7 @@ def test_sync_dataset_static_policy_returns_not_syncable_without_period_arithmet monkeypatch.setattr(services, "create_artifact", lambda **_: pytest.fail("static sync should not create artifacts")) monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) - result = services.sync_dataset(dataset_id=dataset_id, end="ignored", prefer_zarr=True, publish=True) + result = services.sync_dataset(dataset_id=dataset_id, end="ignored", publish=True) assert result.sync_id is None assert result.status == "not_syncable" @@ -507,7 +507,7 @@ def test_sync_route_executes_rematerialize_and_returns_structured_detail( response = client.post( f"/sync/{dataset_id}", - json={"end": "2026-02-10", "prefer_zarr": True, "publish": True}, + json={"end": "2026-02-10", "publish": True}, ) assert response.status_code == 200 @@ -770,7 +770,7 @@ def test_run_sync_raises_clear_error_when_append_invariants_are_missing(monkeypa latest_artifact=latest_artifact, source_dataset={"id": 
"chirps3_precipitation_daily", "period_type": "daily", "sync": {"kind": "temporal"}}, requested_end="2026-02-11", - prefer_zarr=True, + publish=True, create_artifact_fn=lambda **_: pytest.fail("create_artifact should not be called"), get_dataset_fn=lambda _: pytest.fail("get_dataset should not be called"), @@ -938,7 +938,7 @@ def fake_run_sync(**kwargs: object) -> SyncResponse: monkeypatch.setattr(services, "run_sync", fake_run_sync) - services.sync_dataset(dataset_id=dataset_id, end="2024-01-01T05", prefer_zarr=False, publish=False) + services.sync_dataset(dataset_id=dataset_id, end="2024-01-01T05", publish=False) assert captured["current_end"] == "2024-01-01T05" @@ -981,7 +981,7 @@ def fake_run_sync(**kwargs: object) -> SyncResponse: monkeypatch.setattr(services, "run_sync", fake_run_sync) - services.sync_dataset(dataset_id=dataset_id, end="2024-01-01T06", prefer_zarr=False, publish=False) + services.sync_dataset(dataset_id=dataset_id, end="2024-01-01T06", publish=False) assert captured["current_end"] is None diff --git a/tests/test_processing_resample.py b/tests/test_processing_resample.py index 5374fe2a..66b18d76 100644 --- a/tests/test_processing_resample.py +++ b/tests/test_processing_resample.py @@ -42,7 +42,6 @@ def _artifact( dataset_name=dataset_id, variable="value", format=ArtifactFormat.ZARR, - path=str(path), asset_paths=[str(path)], variables=["value"], request_scope=ArtifactRequestScope( @@ -110,7 +109,7 @@ def test_materialize_resampled_artifact_builds_daily_dataset_from_hourly_source( assert artifact.dataset_id == "era5land_temperature_hourly_1d_mean" assert artifact.coverage.temporal.start == "2026-01-01" assert artifact.coverage.temporal.end == "2026-01-02" - result = xr.open_zarr(artifact.path, consolidated=True) + result = xr.open_zarr(artifact.asset_paths[0], consolidated=True) try: assert result["value"].shape == (2, 1, 1) assert result["value"].values[:, 0, 0].tolist() == [11.5, 35.5] @@ -164,7 +163,7 @@ def 
test_materialize_resampled_artifact_supports_custom_frequency_dekadal( assert artifact.dataset_id == "chirps3_precipitation_daily_10d_sum" assert artifact.coverage.temporal.start == "2026-01-01" - result = xr.open_zarr(artifact.path, consolidated=True) + result = xr.open_zarr(artifact.asset_paths[0], consolidated=True) try: assert result["value"].values[:, 0, 0].tolist() == [10.0] finally: @@ -184,42 +183,6 @@ def test_materialize_resampled_artifact_returns_404_when_source_dataset_template ) -def test_materialize_resampled_artifact_returns_409_when_source_is_netcdf( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - source_artifact = _artifact( - artifact_id="source-netcdf", - dataset_id="era5land_temperature_hourly", - managed_dataset_id="era5land_temperature_hourly_sle", - path=tmp_path / "source.nc", - start="2026-01-01", - end="2026-01-02", - ) - source_artifact = source_artifact.model_copy(update={"format": ArtifactFormat.NETCDF}) - - monkeypatch.setattr( - resample.registry_datasets, - "get_dataset", - lambda dataset_id: {"id": dataset_id, "period_type": "daily"}, - ) - monkeypatch.setattr( - resample.ingestion_services, - "get_latest_artifact_for_dataset_or_404", - lambda _: source_artifact, - ) - - with pytest.raises(resample.HTTPException, match="Zarr or Icechunk"): - resample.materialize_resampled_artifact( - source_dataset_id="era5land_temperature_hourly", - frequency="1D", - method="mean", - start="2026-01-01", - end="2026-01-02", - overwrite=False, - publish=False, - ) - def test_materialize_resampled_artifact_reads_icechunk_source( monkeypatch: pytest.MonkeyPatch, @@ -316,7 +279,7 @@ def test_materialize_resampled_artifact_drops_incomplete_trailing_week( # W03 (Jan 12-18) is incomplete — only W02 (Jan 5-11) is covered fully assert artifact.coverage.temporal.start == "2026-W02" assert artifact.coverage.temporal.end == "2026-W02" - result = xr.open_zarr(artifact.path, consolidated=True) + result = xr.open_zarr(artifact.asset_paths[0], 
consolidated=True) try: assert result["value"].values[:, 0, 0].tolist() == [7.0] finally: @@ -370,7 +333,7 @@ def test_materialize_resampled_artifact_drops_incomplete_leading_week( # W02 (Jan 5-11) starts Wednesday Jan 7 — incomplete leading week dropped assert artifact.coverage.temporal.start == "2026-W03" assert artifact.coverage.temporal.end == "2026-W03" - result = xr.open_zarr(artifact.path, consolidated=True) + result = xr.open_zarr(artifact.asset_paths[0], consolidated=True) try: assert result["value"].values[:, 0, 0].tolist() == [7.0] finally: @@ -473,7 +436,7 @@ def test_materialize_resampled_artifact_builds_monthly_dataset_from_daily_source # Monthly resampled timestamp is the start of the month assert artifact.coverage.temporal.start == "2026-01" assert artifact.coverage.temporal.end == "2026-01" - result = xr.open_zarr(artifact.path, consolidated=True) + result = xr.open_zarr(artifact.asset_paths[0], consolidated=True) try: assert result["value"].values[:, 0, 0].tolist() == [31.0] finally: @@ -530,7 +493,7 @@ def test_materialize_resampled_artifact_keeps_complete_week_for_daily_non_midnig assert artifact.coverage.temporal.start == "2026-W02" assert artifact.coverage.temporal.end == "2026-W02" - result = xr.open_zarr(artifact.path, consolidated=True) + result = xr.open_zarr(artifact.asset_paths[0], consolidated=True) try: assert result["value"].values[:, 0, 0].tolist() == [7.0] finally: @@ -787,7 +750,7 @@ def test_materialize_resampled_artifact_rematerializes_when_overwrite_is_true( ) assert second.artifact_id == first.artifact_id - result = xr.open_zarr(second.path, consolidated=True) + result = xr.open_zarr(second.asset_paths[0], consolidated=True) try: assert result["value"].values[:, 0, 0].tolist() == [35.5] finally: diff --git a/tests/test_stac.py b/tests/test_stac.py index 9084b136..37c491ea 100644 --- a/tests/test_stac.py +++ b/tests/test_stac.py @@ -25,7 +25,7 @@ @pytest.fixture(autouse=True) def _clear_xstac_collection_cache() -> None: - 
stac_services._clear_xstac_collection_cache() + stac_services._xstac_collection_cache.clear() def _artifact( @@ -100,14 +100,13 @@ def test_catalog_self_link_reflects_request_path(client: TestClient, monkeypatch assert payload["links"][0]["href"].endswith("/stac") -def test_catalog_excludes_unpublished_and_netcdf(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: +def test_catalog_excludes_unpublished(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr( ingestion_services, "list_artifacts", lambda: SimpleNamespace( items=[ _artifact(artifact_id="a1", status=PublicationStatus.UNPUBLISHED), - _artifact(artifact_id="a2", format=ArtifactFormat.NETCDF), ] ), ) From 1cfc7a12624babe62950aadbb517901b6f2e7549 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 13:37:23 +0200 Subject: [PATCH 47/80] refactor: sync IngestionPlugin protocol, enumerate_periods utility, clean public API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Protocol methods (probe/periods/fetch_period) are now sync — the orchestrator runs probe and fetch_period via asyncio.to_thread and calls periods directly. Removes per-plugin ThreadPoolExecutor boilerplate. enumerate_periods() added to protocol.py: handles daily/hourly/monthly/yearly enumeration with an optional cutoff date. CHIRPS3 and ERA5-Land use it from periods(); WorldPop keeps inline year-range logic. _probe_estimate and _fetch_sync inlined into the public Protocol methods for CHIRPS3 and WorldPop — tests updated to call probe/fetch_period directly. Fixes one ruff line-length violation in services.py; removes unused imports. Docs updated: async removed from plugin skeleton in extensibility.md and adding_custom_datasets.md; enumerate_periods shown in skeleton example. 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/orchestrator.py | 6 +- climate_api/ingest/plugins/chirps3.py | 92 +++++++------------- climate_api/ingest/plugins/era5_land.py | 107 +++++++----------------- climate_api/ingest/plugins/worldpop.py | 88 ++++++++----------- climate_api/ingest/protocol.py | 73 +++++++++++++++- climate_api/ingestions/routes.py | 2 +- climate_api/ingestions/services.py | 8 +- docs/adding_custom_datasets.md | 25 ++---- docs/extensibility.md | 6 +- tests/test_ingest_orchestrator.py | 24 +++--- tests/test_ingest_plugins.py | 83 +++++++----------- 11 files changed, 231 insertions(+), 283 deletions(-) diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index 5e145dfa..59e0b463 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -119,10 +119,10 @@ async def run_ingest( concurrently. Writes are always sequential: tasks are awaited in chronological order so the time axis stays sorted. """ - spec: GridSpec = await plugin.probe(bbox, **params) + spec: GridSpec = await asyncio.to_thread(plugin.probe, bbox, **params) logger.info("Probe: shape=%s crs=EPSG:%d time_dim=%s", spec.shape, spec.crs, spec.time_dim) - all_periods = await plugin.periods(start, end) + all_periods = plugin.periods(start, end) if not all_periods: logger.info("No periods available for range %s..%s", start, end) return @@ -162,7 +162,7 @@ async def run_ingest( async def _fetch(period_id: str) -> xr.Dataset: async with semaphore: - return await plugin.fetch_period(period_id, bbox, **params) + return await asyncio.to_thread(plugin.fetch_period, period_id, bbox, **params) # Create all tasks upfront so up to max_concurrency fetches start immediately. # Await in chronological order so writes are always sequential. 
diff --git a/climate_api/ingest/plugins/chirps3.py b/climate_api/ingest/plugins/chirps3.py index cc76d61a..81d07d35 100644 --- a/climate_api/ingest/plugins/chirps3.py +++ b/climate_api/ingest/plugins/chirps3.py @@ -18,17 +18,15 @@ from __future__ import annotations -import asyncio import calendar import logging -from concurrent.futures import ThreadPoolExecutor -from datetime import date, timedelta +from datetime import date from typing import Any import numpy as np import xarray as xr -from climate_api.ingest.protocol import GridSpec +from climate_api.ingest.protocol import GridSpec, enumerate_periods logger = logging.getLogger(__name__) @@ -38,9 +36,6 @@ # After the 20th of a month, the previous month is considered complete _COMPLETE_AFTER_DAY = 20 -_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="chirps3") - - class Chirps3Plugin: """IngestionPlugin for CHIRPS v3 daily precipitation. @@ -69,33 +64,25 @@ def __init__(self, stage: str = "final", flavor: str = "rnl") -> None: # Protocol implementation # ------------------------------------------------------------------ - async def probe(self, bbox: list[float], **_: Any) -> GridSpec: - """Estimate grid spec from known CHIRPS3 resolution — no data transfer.""" - return self._probe_estimate(bbox) - - async def periods(self, start: str, end: str) -> list[str]: - return self._build_periods(start, end) - - async def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: - return await asyncio.get_running_loop().run_in_executor(_executor, self._fetch_sync, period_id, bbox) - - # ------------------------------------------------------------------ - # Sync helpers (run inside the thread pool) - # ------------------------------------------------------------------ + def probe(self, bbox: list[float], **_: Any) -> GridSpec: + """Derive GridSpec from CHIRPS3's known 0.05° resolution — no data transfer.""" + import math - def _url_for_day(self, d: date) -> str: - if self.stage == "final": 
- return ( - f"https://data.chc.ucsb.edu/products/CHIRPS/v3.0/daily/final/" - f"{self.flavor}/cogs/{d.year}/" - f"chirps-v3.0.{self.flavor}.{d.year}.{d.month:02d}.{d.day:02d}.cog" - ) - return ( - f"https://data.chc.ucsb.edu/products/CHIRPS/v3.0/daily/prelim/sat/" - f"{d.year}/chirps-v3.0.prelim.{d.year}.{d.month:02d}.{d.day:02d}.tif" + xmin, ymin, xmax, ymax = map(float, bbox) + nx = max(1, math.ceil((xmax - xmin) / _CHIRPS3_RES_DEG)) + ny = max(1, math.ceil((ymax - ymin) / _CHIRPS3_RES_DEG)) + return GridSpec( + shape=(ny, nx), + crs=4326, + dtype=np.dtype("float32"), + nodata=_CHIRPS3_NODATA, + time_dim=True, ) - def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: + def periods(self, start: str, end: str) -> list[str]: + return enumerate_periods(start, end, "daily", cutoff=self._availability_cutoff()) + + def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: """Fetch one day via COG range request, clip to bbox, return as Dataset.""" import rioxarray @@ -116,25 +103,22 @@ def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: ds = da.to_dataset(name="precip") return ds.expand_dims(time=[np.datetime64(period_id, "D")]) # type: ignore[no-any-return] - def _probe_estimate(self, bbox: list[float]) -> GridSpec: - """Derive GridSpec from CHIRPS3's known 0.05° resolution.""" - import math - - xmin, ymin, xmax, ymax = map(float, bbox) - nx = max(1, math.ceil((xmax - xmin) / _CHIRPS3_RES_DEG)) - ny = max(1, math.ceil((ymax - ymin) / _CHIRPS3_RES_DEG)) - return GridSpec( - shape=(ny, nx), - crs=4326, - dtype=np.dtype("float32"), - nodata=_CHIRPS3_NODATA, - time_dim=True, - ) - # ------------------------------------------------------------------ - # Period generation + # URL construction and availability # ------------------------------------------------------------------ + def _url_for_day(self, d: date) -> str: + if self.stage == "final": + return ( + 
f"https://data.chc.ucsb.edu/products/CHIRPS/v3.0/daily/final/" + f"{self.flavor}/cogs/{d.year}/" + f"chirps-v3.0.{self.flavor}.{d.year}.{d.month:02d}.{d.day:02d}.cog" + ) + return ( + f"https://data.chc.ucsb.edu/products/CHIRPS/v3.0/daily/prelim/sat/" + f"{d.year}/chirps-v3.0.prelim.{d.year}.{d.month:02d}.{d.day:02d}.tif" + ) + def _availability_cutoff(self) -> date: """Return the last day of the most recent complete published month.""" today = date.today() @@ -146,17 +130,3 @@ def _availability_cutoff(self) -> date: m, y = 12, y - 1 last_day = calendar.monthrange(y, m)[1] return date(y, m, last_day) - - def _build_periods(self, start: str, end: str) -> list[str]: - """Return daily ISO-date strings from start to end, clamped to availability.""" - cutoff = self._availability_cutoff() - start_date = date.fromisoformat(start[:10]) - end_date = min(date.fromisoformat(end[:10]), cutoff) - if start_date > end_date: - return [] - periods: list[str] = [] - current = start_date - while current <= end_date: - periods.append(current.isoformat()) - current += timedelta(days=1) - return periods diff --git a/climate_api/ingest/plugins/era5_land.py b/climate_api/ingest/plugins/era5_land.py index 15a14187..3051d1e5 100644 --- a/climate_api/ingest/plugins/era5_land.py +++ b/climate_api/ingest/plugins/era5_land.py @@ -10,17 +10,14 @@ from __future__ import annotations -import asyncio -import calendar import logging -from concurrent.futures import ThreadPoolExecutor from datetime import date, timedelta from typing import Any import numpy as np import xarray as xr -from climate_api.ingest.protocol import GridSpec +from climate_api.ingest.protocol import GridSpec, enumerate_periods logger = logging.getLogger(__name__) @@ -30,10 +27,6 @@ # ERA5-Land on DestinE has roughly a 15-day publication lag. _LAG_DAYS = 15 -# Thread pool shared across probe/fetch calls so async methods don't block the -# event loop while waiting for remote I/O. 
-_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="era5land") - class Era5LandPlugin: """IngestionPlugin for ERA5-Land hourly data from DestinE Earth Data Hub. @@ -53,44 +46,8 @@ def __init__(self, variable: str) -> None: # Protocol implementation # ------------------------------------------------------------------ - async def probe(self, bbox: list[float], **_: Any) -> GridSpec: + def probe(self, bbox: list[float], **_: Any) -> GridSpec: """Open the remote zarr metadata-only and return the grid spec for bbox.""" - return await asyncio.get_running_loop().run_in_executor(_executor, self._probe_sync, bbox) - - async def periods(self, start: str, end: str) -> list[str]: - """Return hourly period IDs available within the provider's lag window.""" - return self._build_periods(start, end) - - async def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: - """Fetch one hourly period from the remote zarr store.""" - return await asyncio.get_running_loop().run_in_executor(_executor, self._fetch_sync, period_id, bbox) - - # ------------------------------------------------------------------ - # Sync helpers (run inside the thread pool) - # ------------------------------------------------------------------ - - def _open_remote(self) -> xr.Dataset: - return xr.open_dataset( - _DESTINE_ZARR_URL, - engine="zarr", - storage_options=_STORAGE_OPTIONS, - chunks={}, - )[[self.variable]] - - def _correct_longitude(self, ds: xr.Dataset) -> xr.Dataset: - """Unwrap 0–360 longitude to −180–180 and sort.""" - return ds.assign_coords(longitude=((ds.longitude + 180) % 360 - 180)).sortby("longitude") - - def _select_bbox(self, ds: xr.Dataset, bbox: list[float]) -> xr.Dataset: - xmin, ymin, xmax, ymax = map(float, bbox) - lon_res = float(abs(ds.longitude.diff("longitude").median())) - lat_res = float(abs(ds.latitude.diff("latitude").median())) - return ds.sel( - longitude=slice(xmin - lon_res, xmax + lon_res), - latitude=slice(ymax + lat_res, ymin - 
lat_res), - ) - - def _probe_sync(self, bbox: list[float]) -> GridSpec: ds = self._open_remote() ds = self._correct_longitude(ds) ds = self._select_bbox(ds, bbox) @@ -107,8 +64,13 @@ def _probe_sync(self, bbox: list[float]) -> GridSpec: y_dim="y", ) - def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: - """Fetch one hourly period: remote zarr → bbox clip → load → return.""" + def periods(self, start: str, end: str) -> list[str]: + """Return hourly period IDs available within the provider's lag window.""" + cutoff = date.today() - timedelta(days=_LAG_DAYS) + return enumerate_periods(start, end, "hourly", cutoff=cutoff) + + def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: + """Fetch one hourly period from the remote zarr store.""" hour = int(period_id[-2:]) if len(period_id) > 10 else 0 date_part = period_id[:10] @@ -128,37 +90,26 @@ def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: return ds # ------------------------------------------------------------------ - # Period generation + # Helpers # ------------------------------------------------------------------ - def _build_periods(self, start: str, end: str) -> list[str]: - """Generate hourly period IDs, clamped to the provider's availability lag. + def _open_remote(self) -> xr.Dataset: + return xr.open_dataset( + _DESTINE_ZARR_URL, + engine="zarr", + storage_options=_STORAGE_OPTIONS, + chunks={}, + )[[self.variable]] - start and end are period-ID strings of the form 'YYYY-MM-DDTHH'. - The comparison is lexicographic so the filter respects the hour component. - """ - cutoff = date.today() - timedelta(days=_LAG_DAYS) - start_dt = date.fromisoformat(start[:10]) - end_dt = min(date.fromisoformat(end[:10]), cutoff) - # Cutoff clamped to end-of-day of the cutoff date so we filter later. 
- cutoff_period = f"{cutoff.isoformat()}T23" - - periods: list[str] = [] - current = start_dt - while current <= end_dt: - _, last_day = calendar.monthrange(current.year, current.month) - for day_num in range(1, last_day + 1): - d = current.replace(day=day_num) - if d < start_dt or d > end_dt: - continue - for hour in range(24): - p = f"{d.isoformat()}T{hour:02d}" - if p < start or p > end or p > cutoff_period: - continue - periods.append(p) - if current.month == 12: - current = current.replace(year=current.year + 1, month=1, day=1) - else: - current = current.replace(month=current.month + 1, day=1) - - return periods + def _correct_longitude(self, ds: xr.Dataset) -> xr.Dataset: + """Unwrap 0–360 longitude to −180–180 and sort.""" + return ds.assign_coords(longitude=((ds.longitude + 180) % 360 - 180)).sortby("longitude") + + def _select_bbox(self, ds: xr.Dataset, bbox: list[float]) -> xr.Dataset: + xmin, ymin, xmax, ymax = map(float, bbox) + lon_res = float(abs(ds.longitude.diff("longitude").median())) + lat_res = float(abs(ds.latitude.diff("latitude").median())) + return ds.sel( + longitude=slice(xmin - lon_res, xmax + lon_res), + latitude=slice(ymax + lat_res, ymin - lat_res), + ) diff --git a/climate_api/ingest/plugins/worldpop.py b/climate_api/ingest/plugins/worldpop.py index 69691dc9..7bc7310d 100644 --- a/climate_api/ingest/plugins/worldpop.py +++ b/climate_api/ingest/plugins/worldpop.py @@ -12,11 +12,9 @@ from __future__ import annotations -import asyncio import io import logging import math -from concurrent.futures import ThreadPoolExecutor from typing import Any import numpy as np @@ -29,9 +27,6 @@ # WorldPop Global2 at 100m: 3 arc-seconds = 1/1200 degree per pixel _WORLDPOP_RES_DEG = 1.0 / 1200 -_executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="worldpop") - - class WorldPopPlugin: """IngestionPlugin for WorldPop yearly population count data. 
@@ -62,40 +57,34 @@ def __init__(self, country_code: str = "", version: str = "global2") -> None: # Protocol implementation # ------------------------------------------------------------------ - async def probe(self, bbox: list[float], **_: Any) -> GridSpec: - """Estimate grid spec from known WorldPop resolution — no data transfer.""" - return self._probe_estimate(bbox) - - async def periods(self, start: str, end: str) -> list[str]: - return self._build_periods(start, end) - - async def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: - return await asyncio.get_running_loop().run_in_executor(_executor, self._fetch_sync, int(period_id), bbox) - - # ------------------------------------------------------------------ - # Sync helpers (run inside the thread pool) - # ------------------------------------------------------------------ + def probe(self, bbox: list[float], **_: Any) -> GridSpec: + """Derive GridSpec from WorldPop's known 3 arc-second resolution — no data transfer.""" + xmin, ymin, xmax, ymax = map(float, bbox) + nx = max(1, math.ceil((xmax - xmin) / _WORLDPOP_RES_DEG)) + ny = max(1, math.ceil((ymax - ymin) / _WORLDPOP_RES_DEG)) + return GridSpec( + shape=(ny, nx), + crs=4326, + dtype=np.dtype("float32"), + nodata=float("nan"), + time_dim=True, + ) - def _url_for_year(self, year: int) -> str: - cc = self.country_code - if self.version == "global2": - filename = f"{cc.lower()}_pop_{year}_CN_100m_R2025A_v1.tif" - return ( - f"https://data.worldpop.org/GIS/Population/Global_2015_2030/R2025A/" - f"{year}/{cc}/v1/100m/constrained/{filename}" - ) - if self.version == "global1": - filename = f"{cc.lower()}_ppp_{year}_UNadj.tif" - return f"https://data.worldpop.org/GIS/Population/Global_2000_2020/{year}/{cc}/{filename}" - raise ValueError(f"Unknown WorldPop version: {self.version!r}") + def periods(self, start: str, end: str) -> list[str]: + """Return year strings in [start, end] clamped to version availability.""" + start_year = 
int(start[:4]) + end_year = int(end[:4]) + valid_range = (2015, 2030) if self.version == "global2" else (2000, 2020) + return [str(y) for y in range(max(start_year, valid_range[0]), min(end_year, valid_range[1]) + 1)] - def _fetch_sync(self, year: int, bbox: list[float]) -> xr.Dataset: + def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: """Download a per-country GeoTIFF, clip to bbox, return as Dataset.""" import requests import rioxarray + year = int(period_id) url = self._url_for_year(year) - logger.info("Fetching WorldPop %s %d: %s", self.country_code, year, url) + logger.info("Fetching WorldPop %s %s: %s", self.country_code, period_id, url) resp = requests.get(url, timeout=300) resp.raise_for_status() @@ -112,28 +101,21 @@ def _fetch_sync(self, year: int, bbox: list[float]) -> xr.Dataset: da = da.load() ds = da.to_dataset(name="pop_total") - return ds.expand_dims(time=[np.datetime64(f"{year}-01-01", "D")]) # type: ignore[no-any-return] - - def _probe_estimate(self, bbox: list[float]) -> GridSpec: - """Derive GridSpec from WorldPop's known 3 arc-second resolution.""" - xmin, ymin, xmax, ymax = map(float, bbox) - nx = max(1, math.ceil((xmax - xmin) / _WORLDPOP_RES_DEG)) - ny = max(1, math.ceil((ymax - ymin) / _WORLDPOP_RES_DEG)) - return GridSpec( - shape=(ny, nx), - crs=4326, - dtype=np.dtype("float32"), - nodata=float("nan"), - time_dim=True, - ) + return ds.expand_dims(time=[np.datetime64(f"{period_id}-01-01", "D")]) # type: ignore[no-any-return] # ------------------------------------------------------------------ - # Period generation + # URL construction # ------------------------------------------------------------------ - def _build_periods(self, start: str, end: str) -> list[str]: - """Return year strings in [start, end] clamped to version availability.""" - start_year = int(start[:4]) - end_year = int(end[:4]) - valid_range = (2015, 2030) if self.version == "global2" else (2000, 2020) - return [str(y) for y in 
range(max(start_year, valid_range[0]), min(end_year, valid_range[1]) + 1)] + def _url_for_year(self, year: int) -> str: + cc = self.country_code + if self.version == "global2": + filename = f"{cc.lower()}_pop_{year}_CN_100m_R2025A_v1.tif" + return ( + f"https://data.worldpop.org/GIS/Population/Global_2015_2030/R2025A/" + f"{year}/{cc}/v1/100m/constrained/{filename}" + ) + if self.version == "global1": + filename = f"{cc.lower()}_ppp_{year}_UNadj.tif" + return f"https://data.worldpop.org/GIS/Population/Global_2000_2020/{year}/{cc}/{filename}" + raise ValueError(f"Unknown WorldPop version: {self.version!r}") diff --git a/climate_api/ingest/protocol.py b/climate_api/ingest/protocol.py index 95cf269f..0433b7c6 100644 --- a/climate_api/ingest/protocol.py +++ b/climate_api/ingest/protocol.py @@ -3,6 +3,7 @@ from __future__ import annotations from dataclasses import dataclass, field +from datetime import date, timedelta from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable import numpy as np @@ -76,19 +77,20 @@ class IngestionPlugin(Protocol): max_concurrency: int commit_batch_size: int - async def probe(self, bbox: list[float], **params: Any) -> GridSpec: + def probe(self, bbox: list[float], **params: Any) -> GridSpec: """Metadata-only source probe. Returns grid spec. No data transfer.""" ... - async def periods(self, start: str, end: str) -> list[str]: + def periods(self, start: str, end: str) -> list[str]: """Return the ordered list of available period IDs from start to end. May query the upstream source to confirm which periods are published. The orchestrator uses the length of this list for progress reporting. + Use enumerate_periods() as a helper for standard daily/hourly/yearly types. """ ... - async def fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> "xr.Dataset": + def fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> "xr.Dataset": """Fetch one period. Return a dataset in the source CRS. 
The returned dataset must have a 'time' dimension with a single @@ -96,3 +98,68 @@ async def fetch_period(self, period_id: str, bbox: list[float], **params: Any) - The orchestrator handles zarr writes — never call to_zarr here. """ ... + + +def enumerate_periods(start: str, end: str, period_type: str, cutoff: date | None = None) -> list[str]: + """Generate ordered period IDs for [start, end], optionally clamped to cutoff. + + period_type values and ID formats: + 'daily' → YYYY-MM-DD + 'hourly' → YYYY-MM-DDTHH + 'monthly' → YYYY-MM + 'yearly' → YYYY + + cutoff clips the end of the range to the last period on or before that date. + For 'hourly', the cutoff is inclusive through the final hour of the cutoff date. + """ + if period_type == "daily": + s = date.fromisoformat(start[:10]) + e = date.fromisoformat(end[:10]) + if cutoff: + e = min(e, cutoff) + result: list[str] = [] + cur = s + while cur <= e: + result.append(cur.isoformat()) + cur += timedelta(days=1) + return result + + if period_type == "hourly": + cap = f"{cutoff.isoformat()}T23" if cutoff else None + eff_end = min(end, cap) if cap else end + if start > eff_end: + return [] + result = [] + cur = date.fromisoformat(start[:10]) + end_date = date.fromisoformat(eff_end[:10]) + while cur <= end_date: + for h in range(24): + p = f"{cur.isoformat()}T{h:02d}" + if p < start or p > eff_end: + continue + result.append(p) + cur += timedelta(days=1) + return result + + if period_type == "monthly": + sy, sm = int(start[:4]), int(start[5:7]) if len(start) >= 7 else 1 + ey, em = int(end[:4]), int(end[5:7]) if len(end) >= 7 else 12 + if cutoff: + ey, em = min((ey, em), (cutoff.year, cutoff.month)) + result = [] + y, m = sy, sm + while (y, m) <= (ey, em): + result.append(f"{y:04d}-{m:02d}") + m += 1 + if m > 12: + m, y = 1, y + 1 + return result + + if period_type == "yearly": + sy = int(start[:4]) + ey = int(end[:4]) + if cutoff: + ey = min(ey, cutoff.year) + return [str(y) for y in range(sy, ey + 1)] + + raise 
ValueError(f"Unknown period_type: {period_type!r}") diff --git a/climate_api/ingestions/routes.py b/climate_api/ingestions/routes.py index 8f2c52d8..634df817 100644 --- a/climate_api/ingestions/routes.py +++ b/climate_api/ingestions/routes.py @@ -2,7 +2,7 @@ from typing import Any -from fastapi import APIRouter, Header, HTTPException +from fastapi import APIRouter, Header from starlette.responses import Response from climate_api.data_registry.routes import _get_dataset_or_404 diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 55b5114e..aed2bb05 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -7,9 +7,9 @@ import mimetypes import os from collections.abc import Callable -from typing import Any from datetime import UTC, datetime from pathlib import Path +from typing import Any from uuid import uuid4 import portalocker @@ -1127,7 +1127,11 @@ def _dataset_links(dataset_id: str, latest: ArtifactRecord) -> list[DatasetAcces DatasetAccessLink(href=f"/datasets/{dataset_id}", rel="self", title="Dataset detail"), DatasetAccessLink(href=f"/zarr/{dataset_id}", rel="zarr", title="Zarr store"), ] - if latest.publication.status == PublicationStatus.PUBLISHED and latest.format in {ArtifactFormat.ZARR, ArtifactFormat.ICECHUNK}: + is_published_store = ( + latest.publication.status == PublicationStatus.PUBLISHED + and latest.format in {ArtifactFormat.ZARR, ArtifactFormat.ICECHUNK} + ) + if is_published_store: links.append(DatasetAccessLink(href=f"/stac/collections/{dataset_id}", rel="stac", title="STAC collection")) if latest.publication.pygeoapi_path is not None: links.append( diff --git a/docs/adding_custom_datasets.md b/docs/adding_custom_datasets.md index 3804987b..e862fe41 100644 --- a/docs/adding_custom_datasets.md +++ b/docs/adding_custom_datasets.md @@ -192,25 +192,22 @@ For sources that need streaming access or resumable long ingests, implement an ` from __future__ import annotations import 
asyncio -from concurrent.futures import ThreadPoolExecutor from typing import Any import numpy as np import xarray as xr -from climate_api.ingest.protocol import GridSpec - -_executor = ThreadPoolExecutor(max_workers=2) +from climate_api.ingest.protocol import GridSpec, enumerate_periods class MyPlugin: max_concurrency = 2 # fetch this many periods in parallel - commit_batch_size = 1 # commit every N periods + commit_batch_size = 1 # cursor checkpoint interval def __init__(self, variable: str) -> None: self.variable = variable - async def probe(self, bbox: list[float], **_: Any) -> GridSpec: + def probe(self, bbox: list[float], **_: Any) -> GridSpec: """Return grid shape and CRS without downloading data.""" # Derive shape from known resolution, or open a small metadata request. xmin, ymin, xmax, ymax = bbox @@ -220,19 +217,15 @@ class MyPlugin: ny = max(1, math.ceil((ymax - ymin) / res)) return GridSpec(shape=(ny, nx), crs=4326, dtype=np.dtype("float32"), nodata=-9999.0) - async def periods(self, start: str, end: str) -> list[str]: + def periods(self, start: str, end: str) -> list[str]: """Return the ordered list of period IDs to fetch.""" - # Return ISO date strings, month strings, year strings, etc. - return ["2024-01-01", "2024-01-02"] # replace with real logic + # enumerate_periods handles daily/hourly/monthly/yearly enumeration and + # optional availability cutoff clamping. + return enumerate_periods(start, end, "daily") - async def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: + def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: """Fetch one period. Must return a Dataset with a 'time' dimension.""" - return await asyncio.get_running_loop().run_in_executor( - _executor, self._fetch_sync, period_id, bbox - ) - - def _fetch_sync(self, period_id: str, bbox: list[float]) -> xr.Dataset: - # Blocking I/O in thread pool — download, clip to bbox, return Dataset. 
+ # Blocking I/O is fine — the orchestrator runs this in asyncio.to_thread. ... ``` diff --git a/docs/extensibility.md b/docs/extensibility.md index 6e15cba1..bae7169e 100644 --- a/docs/extensibility.md +++ b/docs/extensibility.md @@ -58,15 +58,15 @@ class MyPlugin: max_concurrency: int = 1 # parallel fetch limit commit_batch_size: int = 1 # cursor checkpoint interval (every period is committed) - async def probe(self, bbox: list[float], **params) -> GridSpec: + def probe(self, bbox: list[float], **params) -> GridSpec: """Metadata-only source probe. Returns grid shape, CRS, dtype. No data transfer.""" ... - async def periods(self, start: str, end: str) -> list[str]: + def periods(self, start: str, end: str) -> list[str]: """Return the ordered list of available period IDs from start to end.""" ... - async def fetch_period(self, period_id: str, bbox: list[float], **params) -> xr.Dataset: + def fetch_period(self, period_id: str, bbox: list[float], **params) -> xr.Dataset: """Fetch one period. Return a dataset with a 'time' dimension in source CRS.""" ... 
``` diff --git a/tests/test_ingest_orchestrator.py b/tests/test_ingest_orchestrator.py index 88671a53..b676bb40 100644 --- a/tests/test_ingest_orchestrator.py +++ b/tests/test_ingest_orchestrator.py @@ -50,13 +50,13 @@ def __init__(self, periods: list[str]) -> None: self._periods = periods self.fetched: list[str] = [] - async def probe(self, bbox: list[float], **params: Any) -> GridSpec: + def probe(self, bbox: list[float], **params: Any) -> GridSpec: return GridSpec(shape=(4, 4), crs=4326, dtype=np.dtype("float32"), nodata=None) - async def periods(self, start: str, end: str) -> list[str]: + def periods(self, start: str, end: str) -> list[str]: return [p for p in self._periods if start <= p <= end] - async def fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> xr.Dataset: + def fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> xr.Dataset: self.fetched.append(period_id) return _make_monthly_dataset(period_id) @@ -134,7 +134,7 @@ def test_run_ingest_is_idempotent(tmp_path: Path) -> None: ) # Second run fetched nothing new. 
- assert plugin.fetched == ["2024-01", "2024-02"] + assert sorted(plugin.fetched) == ["2024-01", "2024-02"] committed = read_committed_period_ids(store_path, "monthly") assert committed == {"2024-01", "2024-02"} @@ -339,27 +339,27 @@ def test_read_committed_period_ids_empty_when_no_store(tmp_path: Path) -> None: # --------------------------------------------------------------------------- -def test_era5land_build_periods_respects_hour_component() -> None: +def test_era5land_periods_respects_hour_component() -> None: from climate_api.ingest.plugins.era5_land import Era5LandPlugin plugin = Era5LandPlugin(variable="t2m") - periods = plugin._build_periods("2024-01-01T06", "2024-01-01T08") + periods = plugin.periods("2024-01-01T06", "2024-01-01T08") assert periods == ["2024-01-01T06", "2024-01-01T07", "2024-01-01T08"] -def test_era5land_build_periods_single_hour() -> None: +def test_era5land_periods_single_hour() -> None: from climate_api.ingest.plugins.era5_land import Era5LandPlugin plugin = Era5LandPlugin(variable="t2m") - periods = plugin._build_periods("2024-01-01T00", "2024-01-01T00") + periods = plugin.periods("2024-01-01T00", "2024-01-01T00") assert periods == ["2024-01-01T00"] -def test_era5land_build_periods_spans_months() -> None: +def test_era5land_periods_spans_months() -> None: from climate_api.ingest.plugins.era5_land import Era5LandPlugin plugin = Era5LandPlugin(variable="t2m") - periods = plugin._build_periods("2024-01-31T23", "2024-02-01T01") + periods = plugin.periods("2024-01-31T23", "2024-02-01T01") assert periods == ["2024-01-31T23", "2024-02-01T00", "2024-02-01T01"] @@ -515,10 +515,10 @@ def test_era5land_plugin_declares_rechunk_time() -> None: class FakeStaticPlugin(FakePlugin): """FakePlugin variant whose probe returns time_dim=False (static dataset).""" - async def probe(self, bbox: list[float], **params: Any) -> GridSpec: + def probe(self, bbox: list[float], **params: Any) -> GridSpec: return GridSpec(shape=(4, 4), crs=4326, 
dtype=np.dtype("float32"), nodata=None, time_dim=False) - async def fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> xr.Dataset: + def fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> xr.Dataset: self.fetched.append(period_id) return xr.Dataset( {"elevation": xr.DataArray(np.zeros((4, 4), dtype="float32"), dims=["y", "x"])}, diff --git a/tests/test_ingest_plugins.py b/tests/test_ingest_plugins.py index 0d4c6172..795cae6e 100644 --- a/tests/test_ingest_plugins.py +++ b/tests/test_ingest_plugins.py @@ -7,7 +7,6 @@ from __future__ import annotations -import asyncio import math from datetime import date from typing import Any @@ -83,40 +82,40 @@ def test_url_unknown_version_raises(self) -> None: def test_build_periods_global2_basic(self) -> None: plugin = self._make_plugin(version="global2") - periods = plugin._build_periods("2018", "2020") + periods = plugin.periods("2018", "2020") assert periods == ["2018", "2019", "2020"] def test_build_periods_single_year(self) -> None: plugin = self._make_plugin(version="global2") - assert plugin._build_periods("2023", "2023") == ["2023"] + assert plugin.periods("2023", "2023") == ["2023"] def test_build_periods_clamps_to_global2_range(self) -> None: plugin = self._make_plugin(version="global2") - periods = plugin._build_periods("2010", "2035") + periods = plugin.periods("2010", "2035") assert periods[0] == "2015" assert periods[-1] == "2030" def test_build_periods_clamps_to_global1_range(self) -> None: plugin = self._make_plugin(version="global1") - periods = plugin._build_periods("1995", "2025") + periods = plugin.periods("1995", "2025") assert periods[0] == "2000" assert periods[-1] == "2020" def test_build_periods_empty_when_out_of_range(self) -> None: plugin = self._make_plugin(version="global2") - assert plugin._build_periods("2031", "2035") == [] + assert plugin.periods("2031", "2035") == [] def test_build_periods_uses_year_prefix_only(self) -> None: # period strings like 
"2024-01-01" should be handled by stripping to year plugin = self._make_plugin(version="global2") - periods = plugin._build_periods("2024-01-01", "2025-12-31") + periods = plugin.periods("2024-01-01", "2025-12-31") assert periods == ["2024", "2025"] # probe / GridSpec - def test_probe_estimate_returns_gridspec(self) -> None: + def test_probe_returns_gridspec(self) -> None: plugin = self._make_plugin() - spec = plugin._probe_estimate([4.0, 57.5, 31.5, 71.5]) + spec = plugin.probe([4.0, 57.5, 31.5, 71.5]) assert isinstance(spec, GridSpec) assert spec.crs == 4326 assert spec.time_dim is True @@ -124,22 +123,13 @@ def test_probe_estimate_returns_gridspec(self) -> None: assert spec.nodata is not None and math.isnan(spec.nodata) assert spec.shape[0] > 0 and spec.shape[1] > 0 - def test_probe_estimate_shape_proportional_to_bbox(self) -> None: + def test_probe_shape_proportional_to_bbox(self) -> None: plugin = self._make_plugin() - small = plugin._probe_estimate([0.0, 0.0, 1.0, 1.0]) - large = plugin._probe_estimate([0.0, 0.0, 10.0, 10.0]) + small = plugin.probe([0.0, 0.0, 1.0, 1.0]) + large = plugin.probe([0.0, 0.0, 10.0, 10.0]) # 10x wider bbox should yield ~10x more columns assert large.shape[1] > small.shape[1] * 5 - def test_probe_is_async_and_returns_gridspec(self) -> None: - plugin = self._make_plugin() - - async def run() -> GridSpec: - return await plugin.probe([4.0, 57.5, 31.5, 71.5]) - - spec = asyncio.run(run()) - assert isinstance(spec, GridSpec) - # fetch_period (mocked network) def _make_fake_da(self, ny: int = 4, nx: int = 5) -> Any: @@ -165,7 +155,7 @@ def test_fetch_period_returns_dataset_with_time_and_pop_total(self) -> None: fake_resp.content = b"" with patch("requests.get", return_value=fake_resp), patch("rioxarray.open_rasterio", return_value=fake_da): - ds = WorldPopPlugin(country_code="NOR")._fetch_sync(2024, [4.0, 57.5, 31.5, 71.5]) + ds = WorldPopPlugin(country_code="NOR").fetch_period("2024", [4.0, 57.5, 31.5, 71.5]) assert "pop_total" in 
ds.data_vars assert "time" in ds.dims @@ -182,7 +172,7 @@ def test_fetch_period_returns_dataset_with_time_dim(self) -> None: fake_resp.content = b"" with patch("requests.get", return_value=fake_resp), patch("rioxarray.open_rasterio", return_value=fake_da): - ds = WorldPopPlugin(country_code="NOR")._fetch_sync(2024, [4.0, 57.5, 31.5, 71.5]) + ds = WorldPopPlugin(country_code="NOR").fetch_period("2024", [4.0, 57.5, 31.5, 71.5]) assert "time" in ds.dims assert ds.sizes["time"] == 1 @@ -262,69 +252,69 @@ def test_url_prelim_structure(self) -> None: # Period generation - def test_build_periods_returns_daily_dates(self) -> None: + def test_periods_returns_daily_dates(self) -> None: plugin = self._make_plugin() # Use a fixed cutoff by patching today with patch("climate_api.ingest.plugins.chirps3.date") as mock_date: mock_date.today.return_value = date(2024, 3, 25) # day > 20 → cutoff = end of Feb mock_date.fromisoformat = date.fromisoformat mock_date.side_effect = date - periods = plugin._build_periods("2024-02-01", "2024-03-31") + periods = plugin.periods("2024-02-01", "2024-03-31") # Cutoff: end of February 2024 (29 days — 2024 is leap) assert periods[0] == "2024-02-01" assert periods[-1] == "2024-02-29" assert len(periods) == 29 - def test_build_periods_respects_lag_before_threshold_day(self) -> None: + def test_periods_respects_lag_before_threshold_day(self) -> None: plugin = self._make_plugin() with patch("climate_api.ingest.plugins.chirps3.date") as mock_date: mock_date.today.return_value = date(2024, 3, 10) # day <= 20 → cutoff = end of Jan mock_date.fromisoformat = date.fromisoformat mock_date.side_effect = date - periods = plugin._build_periods("2024-01-01", "2024-03-31") + periods = plugin.periods("2024-01-01", "2024-03-31") assert periods[-1] == "2024-01-31" - def test_build_periods_empty_when_start_after_cutoff(self) -> None: + def test_periods_empty_when_start_after_cutoff(self) -> None: plugin = self._make_plugin() with 
patch("climate_api.ingest.plugins.chirps3.date") as mock_date: mock_date.today.return_value = date(2024, 3, 25) mock_date.fromisoformat = date.fromisoformat mock_date.side_effect = date - periods = plugin._build_periods("2024-03-01", "2024-03-31") + periods = plugin.periods("2024-03-01", "2024-03-31") assert periods == [] - def test_build_periods_consecutive(self) -> None: + def test_periods_consecutive(self) -> None: plugin = self._make_plugin() with patch("climate_api.ingest.plugins.chirps3.date") as mock_date: mock_date.today.return_value = date(2024, 4, 25) mock_date.fromisoformat = date.fromisoformat mock_date.side_effect = date - periods = plugin._build_periods("2024-03-01", "2024-03-05") + periods = plugin.periods("2024-03-01", "2024-03-05") assert periods == ["2024-03-01", "2024-03-02", "2024-03-03", "2024-03-04", "2024-03-05"] - def test_build_periods_single_day(self) -> None: + def test_periods_single_day(self) -> None: plugin = self._make_plugin() with patch("climate_api.ingest.plugins.chirps3.date") as mock_date: mock_date.today.return_value = date(2024, 4, 25) mock_date.fromisoformat = date.fromisoformat mock_date.side_effect = date - periods = plugin._build_periods("2024-03-01", "2024-03-01") + periods = plugin.periods("2024-03-01", "2024-03-01") assert periods == ["2024-03-01"] - def test_build_periods_spans_months(self) -> None: + def test_periods_spans_months(self) -> None: plugin = self._make_plugin() with patch("climate_api.ingest.plugins.chirps3.date") as mock_date: mock_date.today.return_value = date(2024, 5, 25) mock_date.fromisoformat = date.fromisoformat mock_date.side_effect = date - periods = plugin._build_periods("2024-03-30", "2024-04-02") + periods = plugin.periods("2024-03-30", "2024-04-02") assert periods == ["2024-03-30", "2024-03-31", "2024-04-01", "2024-04-02"] # probe / GridSpec - def test_probe_estimate_returns_gridspec(self) -> None: + def test_probe_returns_gridspec(self) -> None: plugin = self._make_plugin() - spec = 
plugin._probe_estimate([-180.0, -50.0, 180.0, 50.0]) + spec = plugin.probe([-180.0, -50.0, 180.0, 50.0]) assert isinstance(spec, GridSpec) assert spec.crs == 4326 assert spec.time_dim is True @@ -332,21 +322,12 @@ def test_probe_estimate_returns_gridspec(self) -> None: assert spec.nodata == -9999.0 assert spec.shape[0] > 0 and spec.shape[1] > 0 - def test_probe_estimate_shape_matches_chirps3_global_extent(self) -> None: + def test_probe_shape_matches_chirps3_global_extent(self) -> None: plugin = self._make_plugin() # CHIRPS3 full extent: 360° × 100° at 0.05° → 7200 × 2000 - spec = plugin._probe_estimate([-180.0, -50.0, 180.0, 50.0]) + spec = plugin.probe([-180.0, -50.0, 180.0, 50.0]) assert spec.shape == (2000, 7200) - def test_probe_is_async_and_returns_gridspec(self) -> None: - plugin = self._make_plugin() - - async def run() -> GridSpec: - return await plugin.probe([-180.0, -50.0, 180.0, 50.0]) - - spec = asyncio.run(run()) - assert isinstance(spec, GridSpec) - # fetch_period (mocked network) def _make_fake_chirps_da(self, ny: int = 4, nx: int = 5) -> Any: @@ -367,7 +348,7 @@ def test_fetch_period_returns_dataset_with_time_and_precip(self) -> None: fake_da = self._make_fake_chirps_da() with patch("rioxarray.open_rasterio", return_value=fake_da): - ds = Chirps3Plugin()._fetch_sync("2024-03-15", [-5.0, 5.0, 5.0, 10.0]) + ds = Chirps3Plugin().fetch_period("2024-03-15", [-5.0, 5.0, 5.0, 10.0]) assert "precip" in ds.data_vars assert "time" in ds.dims @@ -384,7 +365,7 @@ def test_fetch_period_masks_nodata_as_nan(self) -> None: da = da.rio.write_crs("EPSG:4326") with patch("rioxarray.open_rasterio", return_value=da): - ds = Chirps3Plugin()._fetch_sync("2024-01-01", [0.0, 1.0, 1.0, 2.0]) + ds = Chirps3Plugin().fetch_period("2024-01-01", [0.0, 1.0, 1.0, 2.0]) precip = ds["precip"].values assert np.isnan(precip).any(), "nodata pixels should be NaN" @@ -395,7 +376,7 @@ def test_fetch_period_returns_dataset_with_time_dim(self) -> None: fake_da = self._make_fake_chirps_da() 
with patch("rioxarray.open_rasterio", return_value=fake_da): - ds = Chirps3Plugin()._fetch_sync("2024-03-15", [-5.0, 5.0, 5.0, 10.0]) + ds = Chirps3Plugin().fetch_period("2024-03-15", [-5.0, 5.0, 5.0, 10.0]) assert "time" in ds.dims assert ds.sizes["time"] == 1 From ba36acd96fd88d4b157b1da0c9d75d65893c1e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 13:49:26 +0200 Subject: [PATCH 48/80] fix: always read committed periods from store on resume; sync plan_sync_dataset with store - Remove cursor-based pending shortcut in run_ingest: the cursor checkpoints every commit_batch_size periods so it lags behind actual Icechunk commits. Trusting it directly would re-fetch already-committed periods after a crash between an Icechunk commit and the next cursor checkpoint. Always use read_committed_period_ids() as ground truth. - Apply the same store-authoritative committed_end read in plan_sync_dataset as sync_dataset does, so the plan endpoint reflects the true on-disk state for Icechunk stores rather than potentially-stale artifact metadata. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/orchestrator.py | 20 +++++++------------- climate_api/ingestions/services.py | 8 ++++++++ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index 59e0b463..cd08c93d 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -127,19 +127,13 @@ async def run_ingest( logger.info("No periods available for range %s..%s", start, end) return - # Determine pending periods: prefer cursor (fast) then fall back to store read. 
- cursor = load_cursor() if load_cursor else None - last_committed: str | None = cursor.get("last_committed") if cursor else None - - if last_committed and last_committed in all_periods and store_path.exists(): - idx = all_periods.index(last_committed) + 1 - pending = all_periods[idx:] - logger.info("Resuming after %s: %d/%d periods remain", last_committed, len(pending), len(all_periods)) - else: - present = read_committed_period_ids(store_path, period_type) - pending = [p for p in all_periods if p not in present] - already_done = len(all_periods) - len(pending) - logger.info("Periods: %d already committed, %d pending", already_done, len(pending)) + # Always use the store as ground truth — the job cursor is a checkpoint that + # lags behind actual Icechunk commits by up to commit_batch_size periods, so + # trusting it directly would re-fetch already-committed periods after a crash. + present = read_committed_period_ids(store_path, period_type) + pending = [p for p in all_periods if p not in present] + already_done = len(all_periods) - len(pending) + logger.info("Periods: %d already committed, %d pending", already_done, len(pending)) if not pending: logger.info("Store is current — nothing to ingest") diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index aed2bb05..32cd78f6 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -550,11 +550,19 @@ def plan_sync_dataset( source_dataset = registry_datasets.get_dataset(latest_artifact.dataset_id) if source_dataset is None: raise HTTPException(status_code=404, detail=f"Source dataset '{latest_artifact.dataset_id}' not found") + committed_end: str | None = None + if latest_artifact.format == ArtifactFormat.ICECHUNK and latest_artifact.asset_paths: + from climate_api.ingest.store import read_committed_period_ids + + period_type = str(source_dataset.get("period_type", "")) + committed = read_committed_period_ids(Path(latest_artifact.asset_paths[0]), 
period_type) + committed_end = max(committed) if committed else None try: return plan_sync( latest_artifact=latest_artifact, source_dataset=source_dataset, requested_end=end, + current_end=committed_end, ) except SyncConfigurationError as exc: raise HTTPException(status_code=500, detail=str(exc)) from exc From 39f2962481651790b2944217f797f56ef491c84e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 13:50:27 +0200 Subject: [PATCH 49/80] style: remove trailing whitespace from blank lines in test_datasets.py Co-Authored-By: Claude Sonnet 4.6 --- tests/test_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index d7cee416..a256127e 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -300,7 +300,7 @@ def test_create_artifact_rejects_partial_download_scope(monkeypatch: pytest.Monk download_end="2026-02-10", bbox=[1.0, 2.0, 3.0, 4.0], overwrite=False, - + publish=False, ) @@ -327,7 +327,7 @@ def test_create_artifact_rejects_download_scope_outside_request_scope(monkeypatc download_end="2026-02-11", bbox=[1.0, 2.0, 3.0, 4.0], overwrite=False, - + publish=False, ) From 6fd343ac82bf8e4528938f85eacc1e85a1b72016 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 13:51:15 +0200 Subject: [PATCH 50/80] style: ruff format Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/plugins/chirps3.py | 1 + climate_api/ingest/plugins/worldpop.py | 1 + climate_api/ingestions/routes.py | 1 - climate_api/ingestions/services.py | 8 +-- climate_api/stac/services.py | 1 - tests/test_dataset_registry.py | 2 - tests/test_datasets.py | 6 -- tests/test_datasets_sync.py | 1 - tests/test_processing_resample.py | 1 - uv.lock | 97 -------------------------- 10 files changed, 6 insertions(+), 113 deletions(-) diff --git a/climate_api/ingest/plugins/chirps3.py b/climate_api/ingest/plugins/chirps3.py index 81d07d35..98fd860d 100644 --- 
a/climate_api/ingest/plugins/chirps3.py +++ b/climate_api/ingest/plugins/chirps3.py @@ -36,6 +36,7 @@ # After the 20th of a month, the previous month is considered complete _COMPLETE_AFTER_DAY = 20 + class Chirps3Plugin: """IngestionPlugin for CHIRPS v3 daily precipitation. diff --git a/climate_api/ingest/plugins/worldpop.py b/climate_api/ingest/plugins/worldpop.py index 7bc7310d..b3964376 100644 --- a/climate_api/ingest/plugins/worldpop.py +++ b/climate_api/ingest/plugins/worldpop.py @@ -27,6 +27,7 @@ # WorldPop Global2 at 100m: 3 arc-seconds = 1/1200 degree per pixel _WORLDPOP_RES_DEG = 1.0 / 1200 + class WorldPopPlugin: """IngestionPlugin for WorldPop yearly population count data. diff --git a/climate_api/ingestions/routes.py b/climate_api/ingestions/routes.py index 634df817..38d997a6 100644 --- a/climate_api/ingestions/routes.py +++ b/climate_api/ingestions/routes.py @@ -103,7 +103,6 @@ def get_dataset(dataset_id: str) -> DatasetDetailRecord: return services.get_dataset_or_404(dataset_id) - @zarr_router.api_route("/{dataset_id}", methods=["GET", "HEAD"]) def get_canonical_zarr_store_info(dataset_id: str) -> dict[str, object]: """Return canonical Zarr store listing for a managed dataset.""" diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 32cd78f6..7c1669ea 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -1135,10 +1135,10 @@ def _dataset_links(dataset_id: str, latest: ArtifactRecord) -> list[DatasetAcces DatasetAccessLink(href=f"/datasets/{dataset_id}", rel="self", title="Dataset detail"), DatasetAccessLink(href=f"/zarr/{dataset_id}", rel="zarr", title="Zarr store"), ] - is_published_store = ( - latest.publication.status == PublicationStatus.PUBLISHED - and latest.format in {ArtifactFormat.ZARR, ArtifactFormat.ICECHUNK} - ) + is_published_store = latest.publication.status == PublicationStatus.PUBLISHED and latest.format in { + ArtifactFormat.ZARR, + ArtifactFormat.ICECHUNK, + } 
if is_published_store: links.append(DatasetAccessLink(href=f"/stac/collections/{dataset_id}", rel="stac", title="STAC collection")) if latest.publication.pygeoapi_path is not None: diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 0b9429fb..c2ccc673 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -319,7 +319,6 @@ def _is_pyramid_zarr(artifact_path: str) -> bool: return (Path(artifact_path) / "0").is_dir() - def _abs_url(request: Request, path: str) -> str: base_url = os.getenv("CLIMATE_API_BASE_URL") if base_url: diff --git a/tests/test_dataset_registry.py b/tests/test_dataset_registry.py index 60730caf..d9757dc4 100644 --- a/tests/test_dataset_registry.py +++ b/tests/test_dataset_registry.py @@ -136,5 +136,3 @@ def test_dataset_registry_accepts_supported_sync_execution( monkeypatch.setattr(datasets, "CONFIGS_DIR", tmp_path) assert datasets.list_datasets()[0]["sync"]["execution"] == "append" - - diff --git a/tests/test_datasets.py b/tests/test_datasets.py index a256127e..865cd447 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -209,7 +209,6 @@ def test_find_existing_artifact_ignores_record_with_overwide_coverage() -> None: records=[stale_artifact, valid_artifact], dataset_id="chirps3_precipitation_daily", request_scope=request_scope, - ) assert result == valid_artifact @@ -233,7 +232,6 @@ def test_find_existing_artifact_ignores_stale_record(monkeypatch: pytest.MonkeyP records=[stale_artifact, valid_artifact], dataset_id="chirps3_precipitation_daily", request_scope=request_scope, - ) assert result == valid_artifact @@ -300,7 +298,6 @@ def test_create_artifact_rejects_partial_download_scope(monkeypatch: pytest.Monk download_end="2026-02-10", bbox=[1.0, 2.0, 3.0, 4.0], overwrite=False, - publish=False, ) @@ -327,7 +324,6 @@ def test_create_artifact_rejects_download_scope_outside_request_scope(monkeypatc download_end="2026-02-11", bbox=[1.0, 2.0, 3.0, 4.0], overwrite=False, - 
publish=False, ) @@ -418,5 +414,3 @@ def fake_serve_icechunk_key(dataset_id: str, art: ArtifactRecord, relative_path: services.get_dataset_zarr_store_file_or_404("chirps3_precipitation_daily", "t2m/zarr.json") assert served_keys == ["t2m/zarr.json"] - - diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index 19019807..e52e63e7 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -770,7 +770,6 @@ def test_run_sync_raises_clear_error_when_append_invariants_are_missing(monkeypa latest_artifact=latest_artifact, source_dataset={"id": "chirps3_precipitation_daily", "period_type": "daily", "sync": {"kind": "temporal"}}, requested_end="2026-02-11", - publish=True, create_artifact_fn=lambda **_: pytest.fail("create_artifact should not be called"), get_dataset_fn=lambda _: pytest.fail("get_dataset should not be called"), diff --git a/tests/test_processing_resample.py b/tests/test_processing_resample.py index 66b18d76..fd0d33f0 100644 --- a/tests/test_processing_resample.py +++ b/tests/test_processing_resample.py @@ -183,7 +183,6 @@ def test_materialize_resampled_artifact_returns_404_when_source_dataset_template ) - def test_materialize_resampled_artifact_reads_icechunk_source( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, diff --git a/uv.lock b/uv.lock index 4dc46e43..4389eb92 100644 --- a/uv.lock +++ b/uv.lock @@ -170,40 +170,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" }, ] -[[package]] -name = "cartopy" -version = "0.25.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "matplotlib" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "pyproj" }, - { name = "pyshp" }, - { name = "shapely" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/3c/3f/ec3dee34237b696a486d566a6d3ae6550ae821836e0412bafdcbbec2cfd2/cartopy-0.25.0.tar.gz", hash = "sha256:55f1a390e5f3f075b221c7d91fb10258ad978db786c7930eba06eb45d28753fe", size = 10767728, upload-time = "2025-08-01T12:44:16.573Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8e/b9/0773ff8f1c755b8a362029e6910db87064d27ca021b060c48ce511ec98b7/cartopy-0.25.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a6fcd2df8039293096f957fc9c76e969b1a9715d12ab8cee1a6bdae0c6773b8b", size = 11007728, upload-time = "2025-08-01T12:44:06.64Z" }, - { url = "https://files.pythonhosted.org/packages/34/a6/75738630b7f64bca7afc6bc5de08ddf0c61f13563f2a1abf642373d1095e/cartopy-0.25.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e4def451617e6957169447fe6ecdad0f63ef2d2007e7d451dd7b9656ada57382", size = 10996613, upload-time = "2025-08-01T12:44:08.822Z" }, - { url = "https://files.pythonhosted.org/packages/19/0d/669d4bbeb36b87ba504409d85c68ec297e6f434ea6525424f8aa5f14abac/cartopy-0.25.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c388824cb13e4fa9c2901dc4fbb2dbe9547acd2f4a6a3440983d4e6c6973ae3", size = 11829044, upload-time = "2025-08-01T12:44:11.402Z" }, - { url = "https://files.pythonhosted.org/packages/01/ff/b46e2120abd99b2ff3d376dc91ed58ae8f0a052d57c242c9b140497573dd/cartopy-0.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:60bad14c072d16e3c96967638cd66eb5a62cf24bc70087bcbfc6b30a3872ed26", size = 10987060, upload-time = "2025-08-01T12:44:14.222Z" }, -] - -[[package]] -name = "cdsapi" -version = "0.7.7" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ecmwf-datastores-client" }, - { name = "requests" }, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/55/f3/6cb5b4bf077c441978c5d5be3a568d37e1f07f3e7177a17fa66aec2594b6/cdsapi-0.7.7.tar.gz", hash = "sha256:bc0cf807c1b78aceba6a11c3a5180f885f47f71a4e58205e324cfedcee16f10b", size 
= 13322, upload-time = "2025-09-30T19:11:22.404Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4f/f4/4a65460d5cb6784128019fd707a87993f378db25e796eba01400a0903f62/cdsapi-0.7.7-py2.py3-none-any.whl", hash = "sha256:384c1658572d6dc53f4111f6dd46fcdfe6fea54a688af9756d71f6fe9118b66d", size = 12293, upload-time = "2025-09-30T19:11:21.184Z" }, -] - [[package]] name = "certifi" version = "2026.1.4" @@ -402,7 +368,6 @@ name = "climate-api" version = "0.1.0a1" source = { editable = "." } dependencies = [ - { name = "dhis2eo" }, { name = "earthkit-transforms" }, { name = "fastapi" }, { name = "geojson-pydantic" }, @@ -436,7 +401,6 @@ dev = [ [package.metadata] requires-dist = [ - { name = "dhis2eo", specifier = ">=1.2.1" }, { name = "earthkit-transforms", specifier = "==0.5.*" }, { name = "fastapi", specifier = ">=0.100.0" }, { name = "geojson-pydantic", specifier = ">=2.1.0" }, @@ -600,27 +564,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/c3/253a89ee03fc9b9682f1541728eb66db7db22148cd94f89ab22528cd1e1b/deprecation-2.1.0-py2.py3-none-any.whl", hash = "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a", size = 11178, upload-time = "2020-04-20T14:23:36.581Z" }, ] -[[package]] -name = "dhis2eo" -version = "1.2.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "aiohttp" }, - { name = "earthkit-data", extra = ["cds", "geopandas", "geotiff", "projection"] }, - { name = "ecmwf-datastores-client" }, - { name = "geopandas" }, - { name = "numpy" }, - { name = "pandas" }, - { name = "requests" }, - { name = "rioxarray" }, - { name = "xarray" }, - { name = "zarr" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e5/3d/0a2945f5459bbda30e2d014a2c02417415374953e119f7a19d58c2bdf004/dhis2eo-1.2.1.tar.gz", hash = "sha256:dfc8687f033ae36758bdbc9f78ac648c50e1badf483356d35ebbee6d00f800e3", size = 22424, upload-time = "2026-05-07T11:52:09.867Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/6d/f3/c07d21f796b809ae72858d537c68fe6be7f9f97d363ba0f76adb604b8795/dhis2eo-1.2.1-py3-none-any.whl", hash = "sha256:25fc25a8225ef7b9384a5eba6df491857562b317312cb931b8af924878fa0c52", size = 29273, upload-time = "2026-05-07T11:52:08.23Z" }, -] - [[package]] name = "donfig" version = "0.8.1.post1" @@ -663,22 +606,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/56/97/098f7faf1267031c54662d52791e219f95528806a744cae37014fc21be07/earthkit_data-0.16.8-py3-none-any.whl", hash = "sha256:afa6a5cc6119756be93951a75014d96ce62eb20a5acba05553a3ddca723e1a35", size = 378372, upload-time = "2026-02-18T13:34:18.988Z" }, ] -[package.optional-dependencies] -cds = [ - { name = "cdsapi" }, -] -geopandas = [ - { name = "geopandas" }, -] -geotiff = [ - { name = "pyproj" }, - { name = "rasterio" }, - { name = "rioxarray" }, -] -projection = [ - { name = "cartopy" }, -] - [[package]] name = "earthkit-meteo" version = "0.5.1" @@ -737,21 +664,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/a3/f58ff573ba0f678ff8116686e868afe436627b19b457a2aba62cd463c9ad/eccodes-2.47.0-py3-none-any.whl", hash = "sha256:13d0b28bd58e94e2c303f42415ca0dcc56ab3febf0f52b1fb0f1d4aa5e7db8e1", size = 91567, upload-time = "2026-04-22T11:30:06.789Z" }, ] -[[package]] -name = "ecmwf-datastores-client" -version = "0.4.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "attrs" }, - { name = "multiurl" }, - { name = "requests" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/51/60/f86eb3e57baf2b1780a7046148c234e9e57b0aeb550d30f39e50991da253/ecmwf_datastores_client-0.4.2.tar.gz", hash = "sha256:7cee1f5e5dab34edcc794cd62bee02c603fafb6f4cc2121c5f012806e0f7934d", size = 48205, upload-time = "2026-01-21T15:27:31.665Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/04/40/2ccf4c87a5f9c8198fe71600d5f307f5dada201c091af8774a9c1e360865/ecmwf_datastores_client-0.4.2-py3-none-any.whl", hash = "sha256:d22a675b35263286de09969502ec897da9ceb9e4c8ec4d709f7ebb3b90d3ae98", size = 29092, upload-time = "2026-01-21T15:27:30.452Z" }, -] - [[package]] name = "entrypoints" version = "0.4" @@ -2297,15 +2209,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0c/82/a2c93e32800940d9573fb28c346772a14778b84ba7524e691b324620ab89/pyright-1.1.408-py3-none-any.whl", hash = "sha256:090b32865f4fdb1e0e6cd82bf5618480d48eecd2eb2e70f960982a3d9a4c17c1", size = 6399144, upload-time = "2026-01-08T08:07:37.082Z" }, ] -[[package]] -name = "pyshp" -version = "3.0.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8c/20/8b07bae73aaa0c3f5a2683ba6e23b46e977e2d33a88126d56bbcc2d135cd/pyshp-3.0.3.tar.gz", hash = "sha256:bf4678b13dd53578ed87669676a2fffeccbcded1ec8ff9cafb36d1b660f4b305", size = 2192568, upload-time = "2025-11-28T17:47:31.616Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/82/06/cad54e8ce758bd836ee5411691cbd49efeb9cc611b374670fce299519334/pyshp-3.0.3-py3-none-any.whl", hash = "sha256:28c8fac8c0c25bb0fecbbfd10ead7f319c2ff2f3b0b44a94f22bd2c93510ad42", size = 58465, upload-time = "2025-11-28T17:47:30.328Z" }, -] - [[package]] name = "pystac" version = "1.14.3" From f30b2922897c743eb45893ffec2c4d7f0494ccb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 13:53:53 +0200 Subject: [PATCH 51/80] fix: resolve mypy errors in store, sync_engine, and services MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - store.py: cast zarr attrs value to str before .split() — attrs.get returns JSON which mypy types as int|float|Mapping|Sequence|None - sync_engine.py: plugin.periods() is now sync; remove asyncio.run() wrapper, add list[str] annotation, drop unused asyncio 
import - services.py: remove redundant type annotations on root/node re-assignments in the bare-path branch of get_dataset_zarr_store_info_or_404 Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/store.py | 2 +- climate_api/ingestions/services.py | 4 ++-- climate_api/ingestions/sync_engine.py | 5 +---- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/climate_api/ingest/store.py b/climate_api/ingest/store.py index afa9ab7e..3cf8af3a 100644 --- a/climate_api/ingest/store.py +++ b/climate_api/ingest/store.py @@ -117,7 +117,7 @@ def build_pyramid_store(store_path: Path, *, x_dim: str = "x", y_dim: str = "y") import xproj # noqa: F401 — registers .proj accessor root = zarr.open_group(read_session.store, mode="r") - proj_code = root.attrs.get("proj:code", "EPSG:4326") + proj_code = str(root.attrs.get("proj:code", "EPSG:4326")) epsg = int(proj_code.split(":")[1]) if ":" in proj_code else 4326 ds_loaded = ds_loaded.proj.assign_crs({"EPSG": epsg}) except Exception: diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 7c1669ea..1dcb6898 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -750,9 +750,9 @@ def _serve_icechunk_key(dataset_id: str, artifact: ArtifactRecord, relative_path is_chunk_key = "/c/" in key or key.startswith("c/") is_file_key = "." 
in last_segment if not is_chunk_key and not is_file_key: - root: zarr.Group = zarr.open_group(session.store, mode="r") + root = zarr.open_group(session.store, mode="r") try: - node: zarr.Group = root[key] # type: ignore[assignment] + node = root[key] # type: ignore[assignment] except KeyError: raise HTTPException(status_code=404, detail=f"Zarr path '{relative_path}' not found") children = sorted(node.keys()) diff --git a/climate_api/ingestions/sync_engine.py b/climate_api/ingestions/sync_engine.py index 6e9d7a7b..7f91cf2f 100644 --- a/climate_api/ingestions/sync_engine.py +++ b/climate_api/ingestions/sync_engine.py @@ -11,7 +11,6 @@ from __future__ import annotations -import asyncio import logging from collections.abc import Callable from datetime import date, datetime, time, timedelta @@ -368,8 +367,6 @@ def _plugin_latest_available_period( - str: the last available period in the range (may equal current_end when nothing new) - None: plugin could not be instantiated (caller falls back to legacy availability logic) - Calls asyncio.run() which requires no running event loop — plan_sync is synchronous - and FastAPI runs sync handlers in a thread pool, so this is safe. 
""" ingestion = source_dataset.get("ingestion") if not isinstance(ingestion, dict): @@ -394,7 +391,7 @@ def _plugin_latest_available_period( return None try: - periods = asyncio.run(plugin.periods(next_period_start, requested_end)) + periods: list[str] = plugin.periods(next_period_start, requested_end) except Exception as exc: logger.debug("plugin.periods() failed during availability check for '%s': %s", plugin_path, exc) return None From 548cd6c5362795ef97e86a0dcd98cdd1c98f834e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 13:57:49 +0200 Subject: [PATCH 52/80] =?UTF-8?q?fix:=20resolve=20pyright=20errors=20?= =?UTF-8?q?=E2=80=94=20remove=20stale=20path=3D=20field,=20suppress=20side?= =?UTF-8?q?-effect=20imports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove path= argument from all ArtifactRecord() constructor calls; the field was deleted in this PR but call sites were not fully updated. asset_paths already carries the same value. - Add pyright: ignore[reportUnusedImport] to xproj and rioxarray side-effect imports (they are intentionally imported only to register .proj/.rio accessors; # noqa: F401 suppresses ruff but not pyright). Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/store.py | 2 +- climate_api/ingestions/services.py | 1 - tests/test_datasets.py | 2 -- tests/test_datasets_sync.py | 2 -- tests/test_ingest_plugins.py | 2 +- tests/test_publications.py | 1 - tests/test_stac.py | 1 - 7 files changed, 2 insertions(+), 9 deletions(-) diff --git a/climate_api/ingest/store.py b/climate_api/ingest/store.py index 3cf8af3a..0ddfbaea 100644 --- a/climate_api/ingest/store.py +++ b/climate_api/ingest/store.py @@ -114,7 +114,7 @@ def build_pyramid_store(store_path: Path, *, x_dim: str = "x", y_dim: str = "y") # topozarr requires xproj CRS on the dataset. Read it from the GeoZarr # root attribute written by the orchestrator (proj:code = "EPSG:"). 
try: - import xproj # noqa: F401 — registers .proj accessor + import xproj # noqa: F401 # pyright: ignore[reportUnusedImport] root = zarr.open_group(read_session.store, mode="r") proj_code = str(root.attrs.get("proj:code", "EPSG:4326")) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 1dcb6898..1e11f76e 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -401,7 +401,6 @@ def _create_icechunk_artifact( dataset_name=str(dataset["name"]), variable=str(dataset["variable"]), format=ArtifactFormat.ICECHUNK, - path=str(store_path.resolve()), asset_paths=[str(store_path.resolve())], variables=[str(dataset["variable"])], request_scope=request_scope, diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 865cd447..f952b8ca 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -38,7 +38,6 @@ def _artifact( dataset_name="CHIRPS3 precipitation", variable="precip", format=ArtifactFormat.ZARR, - path="/tmp/chirps3_precipitation_daily.zarr", asset_paths=["/tmp/chirps3_precipitation_daily.zarr"], variables=["precip"], request_scope=ArtifactRequestScope( @@ -342,7 +341,6 @@ def _icechunk_artifact( dataset_name="CHIRPS3 precipitation", variable="precip", format=ArtifactFormat.ICECHUNK, - path=path, asset_paths=[path], variables=["precip"], request_scope=ArtifactRequestScope( diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index e52e63e7..bf9b7bd9 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -38,7 +38,6 @@ def _artifact( dataset_name="CHIRPS3 precipitation", variable="precip", format=ArtifactFormat.ZARR, - path=path, asset_paths=[path], variables=["precip"], request_scope=ArtifactRequestScope( @@ -795,7 +794,6 @@ def _icechunk_artifact( dataset_name="2m temperature (ERA5-Land)", variable="t2m", format=ArtifactFormat.ICECHUNK, - path=path, asset_paths=[path], variables=["t2m"], request_scope=ArtifactRequestScope( diff --git 
a/tests/test_ingest_plugins.py b/tests/test_ingest_plugins.py index 795cae6e..a5b8f7f4 100644 --- a/tests/test_ingest_plugins.py +++ b/tests/test_ingest_plugins.py @@ -15,7 +15,7 @@ import numpy as np import pandas as pd import pytest -import rioxarray # noqa: F401 — registers the .rio accessor used in test helpers +import rioxarray # noqa: F401 # pyright: ignore[reportUnusedImport] import xarray as xr from climate_api.ingest.protocol import GridSpec, IngestionPlugin diff --git a/tests/test_publications.py b/tests/test_publications.py index 15ea3af5..a80cea1e 100644 --- a/tests/test_publications.py +++ b/tests/test_publications.py @@ -74,7 +74,6 @@ def test_build_collection_resource_keeps_singleton_time_dimension_for_zarr( dataset_name="CHIRPS monthly total precipitation", variable="precip", format=ArtifactFormat.ZARR, - path="/tmp/chirps3_precipitation_daily_ms_sum.zarr", asset_paths=["/tmp/chirps3_precipitation_daily_ms_sum.zarr"], variables=["precip"], request_scope=ArtifactRequestScope(start="2024-01-01", end="2024-01-31"), diff --git a/tests/test_stac.py b/tests/test_stac.py index 37c491ea..0736bb55 100644 --- a/tests/test_stac.py +++ b/tests/test_stac.py @@ -49,7 +49,6 @@ def _artifact( dataset_name=dataset_name, variable=variable, format=format, - path=path, asset_paths=[path] if asset_paths is None and path is not None else (asset_paths or []), variables=[variable], request_scope=ArtifactRequestScope( From b911c47933334e20c7563ce93c803280955364ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 13:59:52 +0200 Subject: [PATCH 53/80] fix: update FakePlugin/EmptyPlugin in tests to use sync periods() The protocol refactor made probe/periods/fetch_period sync methods. Two test fake plugins in test_datasets_sync.py still had async def periods(), causing _plugin_latest_available_period to receive a coroutine instead of a list. 
Co-Authored-By: Claude Sonnet 4.6 --- tests/test_datasets_sync.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index bf9b7bd9..90b13537 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -602,13 +602,13 @@ class FakePlugin: commit_batch_size = 1 rechunk_time = None - async def probe(self, *_a: object, **_k: object) -> object: # type: ignore[override] + def probe(self, *_a: object, **_k: object) -> object: # type: ignore[override] raise NotImplementedError - async def periods(self, start: str, end: str) -> list[str]: + def periods(self, start: str, end: str) -> list[str]: return [d for d in ["2026-02-07", "2026-02-08", "2026-02-09"] if start <= d <= end] - async def fetch_period(self, *_a: object, **_k: object) -> object: # type: ignore[override] + def fetch_period(self, *_a: object, **_k: object) -> object: # type: ignore[override] raise NotImplementedError import climate_api.ingest.orchestrator as orch_mod @@ -636,13 +636,13 @@ class EmptyPlugin: commit_batch_size = 1 rechunk_time = None - async def probe(self, *_a: object, **_k: object) -> object: # type: ignore[override] + def probe(self, *_a: object, **_k: object) -> object: # type: ignore[override] raise NotImplementedError - async def periods(self, start: str, end: str) -> list[str]: + def periods(self, start: str, end: str) -> list[str]: return [] - async def fetch_period(self, *_a: object, **_k: object) -> object: # type: ignore[override] + def fetch_period(self, *_a: object, **_k: object) -> object: # type: ignore[override] raise NotImplementedError import climate_api.ingest.orchestrator as orch_mod From c03fb2eefc4e9d04d3e7a93881b6386690c255f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 14:14:09 +0200 Subject: [PATCH 54/80] =?UTF-8?q?fix:=20correct=20async=E2=86=92sync=20doc?= 
=?UTF-8?q?s;=20guard=20open=5For=5Fcreate=5Frepo=20against=20missing=20pa?= =?UTF-8?q?ths?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - orchestrator.py: update module docstring to say "sync methods" and clarify asyncio.to_thread wrapping; protocol.py: same in IngestionPlugin docstring - GridSpec docstring: clarify shape is logged but not used for chunking; the field drives GeoZarr attribute writes, not store encoding - stac/services.py + sync_engine.py: add Path.exists() check before open_or_create_repo() in read-only probe paths — the function creates a new empty Icechunk repo when the path is missing, which is an unintended side-effect during STAC generation and sync planning Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/orchestrator.py | 6 ++++-- climate_api/ingest/protocol.py | 7 ++++--- climate_api/ingestions/sync_engine.py | 23 +++++++++++++---------- climate_api/stac/services.py | 12 +++++++----- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index cd08c93d..f9bd879d 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -1,8 +1,10 @@ """Per-period Icechunk ingest orchestrator. The orchestrator is the only place that writes to the Icechunk store. -Plugins implement three focused async methods (probe / periods / fetch_period) -and never touch zarr directly. +Plugins implement three focused sync methods (probe / periods / fetch_period) +and never touch zarr directly. The orchestrator runs probe and fetch_period +via asyncio.to_thread so I/O-bound plugins run in the thread pool without +managing their own executor; periods() is called directly (pure computation). Crash recovery: every period is committed individually. 
The cursor is saved every commit_batch_size periods so that a restart resumes from the last diff --git a/climate_api/ingest/protocol.py b/climate_api/ingest/protocol.py index 0433b7c6..83a94212 100644 --- a/climate_api/ingest/protocol.py +++ b/climate_api/ingest/protocol.py @@ -16,8 +16,9 @@ class GridSpec: """Source grid metadata returned by a plugin probe. - The orchestrator uses this to fix the zarr chunk shape and write GeoZarr - attributes before the first period is written. Set time_dim=False for + The orchestrator uses this to write GeoZarr attributes (CRS, bbox, dtype, + nodata) before the first period is written. Shape is logged but chunking + is not currently applied from this value. Set time_dim=False for static (time-invariant) datasets — the orchestrator branches on this flag and issues a single write with no append dimension. @@ -43,7 +44,7 @@ class IngestionPlugin(Protocol): """Minimal interface a plugin must implement for per-period Icechunk ingest. The climate-api layer owns the orchestration loop — plugins never touch - zarr or Icechunk directly. Implement the three async methods and declare + zarr or Icechunk directly. Implement the three sync methods and declare max_concurrency and commit_batch_size as class attributes. max_concurrency: maximum number of fetch_period calls in flight at once. 
diff --git a/climate_api/ingestions/sync_engine.py b/climate_api/ingestions/sync_engine.py index 7f91cf2f..f8e72e2c 100644 --- a/climate_api/ingestions/sync_engine.py +++ b/climate_api/ingestions/sync_engine.py @@ -424,17 +424,20 @@ def _supports_append(source_dataset: dict[str, Any], latest_artifact: ArtifactRe try: import zarr - repo = open_or_create_repo(Path(latest_artifact.asset_paths[0])) - session = repo.readonly_session("main") - root = zarr.open_group(session.store, mode="r") - if "multiscales" in root.attrs: - logger.warning( - "Sync append is not supported for pyramid Icechunk dataset '%s'; falling back to rematerialize", - source_dataset.get("id", ""), - ) - return False + icechunk_path = Path(latest_artifact.asset_paths[0]) + if icechunk_path.exists(): + repo = open_or_create_repo(icechunk_path) + session = repo.readonly_session("main") + root = zarr.open_group(session.store, mode="r") + if "multiscales" in root.attrs: + logger.warning( + "Sync append not supported for pyramid Icechunk dataset '%s'; " + "falling back to rematerialize", + source_dataset.get("id", ""), + ) + return False except Exception: - pass # store missing or unreadable — let the ingest path handle it + pass # store unreadable — let the ingest path handle it return True if source_dataset.get("sync", {}).get("execution") != SyncAction.APPEND.value: diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index c2ccc673..122f1b1a 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -453,11 +453,13 @@ def _zarr_open_kwargs(artifact: ArtifactRecord) -> dict[str, object]: from climate_api.ingest.store import open_or_create_repo - repo = open_or_create_repo(Path(artifact_path)) - session = repo.readonly_session("main") - root = zarr.open_group(session.store, mode="r") - if "multiscales" in root.attrs: - kwargs["group"] = "0" + icechunk_path = Path(artifact_path) + if icechunk_path.exists(): + repo = open_or_create_repo(icechunk_path) + session = 
repo.readonly_session("main") + root = zarr.open_group(session.store, mode="r") + if "multiscales" in root.attrs: + kwargs["group"] = "0" except Exception: pass return kwargs From 981966a756d8675a0c4e26d4183e7dea7b76c1ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 14:25:40 +0200 Subject: [PATCH 55/80] =?UTF-8?q?fix:=20address=20review=20findings=20?= =?UTF-8?q?=E2=80=94=20remove=20load=5Fcursor,=20fix=20ERA5=20probe,=20cla?= =?UTF-8?q?rify=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove load_cursor from run_ingest/run_ingest_sync/create_artifact/ execute_ingest: resume is always store-based (read_committed_period_ids); the parameter accepted a value but never called it - Collapse already_done/done_offset duplicate in orchestrator.py - ERA5LandPlugin.probe: compute shape from known 0.1° resolution (no network I/O); update _select_bbox to use the constant instead of lazy dask compute. Resolves "metadata-only probe" docstring claim and x_dim/y_dim mismatch - protocol.py: periods() docstring — must be pure computation, no I/O; rechunk_time docstring — clarify omitting the attribute is valid - WorldPopPlugin: document full-rebuild-on-sync and memory bound in docstring - Add test_run_ingest_preserves_committed_periods_on_fetch_error: verifies periods committed before a mid-run fetch exception are retained in the store - Add asyncio.run() event-loop precondition note to run_ingest_sync docstring Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/orchestrator.py | 23 +++++----- climate_api/ingest/plugins/era5_land.py | 24 +++++----- climate_api/ingest/plugins/worldpop.py | 8 ++++ climate_api/ingest/protocol.py | 16 ++++--- climate_api/ingestions/execution.py | 4 +- climate_api/ingestions/services.py | 4 -- tests/test_ingest_orchestrator.py | 59 +++++++++++++++++++++---- 7 files changed, 93 insertions(+), 45 deletions(-) diff --git 
a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py index f9bd879d..4d2315f2 100644 --- a/climate_api/ingest/orchestrator.py +++ b/climate_api/ingest/orchestrator.py @@ -106,16 +106,14 @@ async def run_ingest( on_progress: Callable[..., None] | None = None, is_cancel_requested: Callable[[], bool] | None = None, save_cursor: Callable[[dict[str, Any]], None] | None = None, - load_cursor: Callable[[], dict[str, Any] | None] | None = None, rechunk_time: int | None = None, apply_transforms: Callable[[xr.Dataset], xr.Dataset] | None = None, pyramid: bool = False, ) -> None: """Probe the source then stream per-period data into an Icechunk store. - On the first run creates the store. On resume continues from the last - committed period recorded in the job cursor (falling back to reading the - store's committed time coordinates when no cursor is present). + On the first run creates the store. On resume, reads committed period IDs + directly from the Icechunk store and ingests only the missing periods. Memory usage is bounded by plugin.max_concurrency datasets held in flight concurrently. Writes are always sequential: tasks are awaited in @@ -141,14 +139,13 @@ async def run_ingest( logger.info("Store is current — nothing to ingest") return - done_offset = len(all_periods) - len(pending) if on_progress: - on_progress(done=done_offset, total=len(all_periods), message=f"{len(pending)} periods pending") + on_progress(done=already_done, total=len(all_periods), message=f"{len(pending)} periods pending") # True when no periods have been committed yet — handles both a brand-new # store and a store directory that exists as an empty skeleton from a # previous failed initialisation (where append_dim would fail on an empty store). - is_first_write = done_offset == 0 + is_first_write = already_done == 0 # Capture before any commits so expire_snapshots only marks snapshots that # were created during this run, not the pre-existing HEAD. 
ingest_started_at = datetime.now(tz=timezone.utc) @@ -205,7 +202,7 @@ async def _fetch(period_id: str) -> xr.Dataset: logger.debug("Committed: %s (%d/%d)", period_id, i + 1, len(pending)) if on_progress: - on_progress(done=done_offset + i + 1, total=len(all_periods), message=f"Wrote {period_id}") + on_progress(done=already_done + i + 1, total=len(all_periods), message=f"Wrote {period_id}") if not spec.time_dim: for t in tasks[i + 1 :]: @@ -252,12 +249,17 @@ def run_ingest_sync( on_progress: Callable[..., None] | None = None, is_cancel_requested: Callable[[], bool] | None = None, save_cursor: Callable[[dict[str, Any]], None] | None = None, - load_cursor: Callable[[], dict[str, Any] | None] | None = None, rechunk_time: int | None = None, apply_transforms: Callable[[xr.Dataset], xr.Dataset] | None = None, pyramid: bool = False, ) -> None: - """Synchronous wrapper around run_ingest for use in threaded job workers.""" + """Synchronous wrapper around run_ingest for use in threaded job workers. + + Must be called from a thread with no running event loop (e.g. a FastAPI + background task dispatched via the job framework's thread pool). + asyncio.run() creates a new event loop and will raise RuntimeError if one + is already running in the calling thread. 
+ """ asyncio.run( run_ingest( plugin=plugin, @@ -270,7 +272,6 @@ def run_ingest_sync( on_progress=on_progress, is_cancel_requested=is_cancel_requested, save_cursor=save_cursor, - load_cursor=load_cursor, rechunk_time=rechunk_time, apply_transforms=apply_transforms, pyramid=pyramid, diff --git a/climate_api/ingest/plugins/era5_land.py b/climate_api/ingest/plugins/era5_land.py index 3051d1e5..eff2b835 100644 --- a/climate_api/ingest/plugins/era5_land.py +++ b/climate_api/ingest/plugins/era5_land.py @@ -11,6 +11,7 @@ from __future__ import annotations import logging +import math from datetime import date, timedelta from typing import Any @@ -24,6 +25,8 @@ _DESTINE_ZARR_URL = "https://data.earthdatahub.destine.eu/era5/reanalysis-era5-land-no-antartica-v0.zarr" _STORAGE_OPTIONS = {"client_kwargs": {"trust_env": True}} +# ERA5-Land native resolution: 0.1° × 0.1° (~9 km at equator). +_ERA5_LAND_RES_DEG = 0.1 # ERA5-Land on DestinE has roughly a 15-day publication lag. _LAG_DAYS = 15 @@ -47,17 +50,15 @@ def __init__(self, variable: str) -> None: # ------------------------------------------------------------------ def probe(self, bbox: list[float], **_: Any) -> GridSpec: - """Open the remote zarr metadata-only and return the grid spec for bbox.""" - ds = self._open_remote() - ds = self._correct_longitude(ds) - ds = self._select_bbox(ds, bbox) - da = ds[self.variable] - ny = da.sizes["latitude"] - nx = da.sizes["longitude"] + """Derive GridSpec from ERA5-Land's known 0.1° resolution — no data transfer.""" + xmin, ymin, xmax, ymax = map(float, bbox) + # _select_bbox pads by one pixel in each direction so probe matches fetch shape. 
+ nx = max(1, math.ceil((xmax - xmin + 2 * _ERA5_LAND_RES_DEG) / _ERA5_LAND_RES_DEG)) + ny = max(1, math.ceil((ymax - ymin + 2 * _ERA5_LAND_RES_DEG) / _ERA5_LAND_RES_DEG)) return GridSpec( shape=(ny, nx), crs=4326, - dtype=np.dtype(da.dtype), + dtype=np.dtype("float32"), nodata=None, time_dim=True, x_dim="x", @@ -107,9 +108,8 @@ def _correct_longitude(self, ds: xr.Dataset) -> xr.Dataset: def _select_bbox(self, ds: xr.Dataset, bbox: list[float]) -> xr.Dataset: xmin, ymin, xmax, ymax = map(float, bbox) - lon_res = float(abs(ds.longitude.diff("longitude").median())) - lat_res = float(abs(ds.latitude.diff("latitude").median())) + pad = _ERA5_LAND_RES_DEG return ds.sel( - longitude=slice(xmin - lon_res, xmax + lon_res), - latitude=slice(ymax + lat_res, ymin - lat_res), + longitude=slice(xmin - pad, xmax + pad), + latitude=slice(ymax + pad, ymin - pad), ) diff --git a/climate_api/ingest/plugins/worldpop.py b/climate_api/ingest/plugins/worldpop.py index b3964376..1780fdac 100644 --- a/climate_api/ingest/plugins/worldpop.py +++ b/climate_api/ingest/plugins/worldpop.py @@ -37,6 +37,14 @@ class WorldPopPlugin: upper-case for directory names, lower-case for filenames). version: Dataset version — 'global2' (2015–2030, default) or 'global1' (2000–2020). + + Each country GeoTIFF is downloaded in full (~100–500 MB) and clipped to + the bbox in memory. max_concurrency=1 is required to bound peak memory. + + Sync behaviour: WorldPop stores are pyramid stores (pyramid=True). The + orchestrator cannot append to pyramid stores, so each sync triggers a + full rematerialization — all years are re-fetched and the pyramid is + rebuilt from scratch. This is expected and intentional. 
""" max_concurrency = 1 diff --git a/climate_api/ingest/protocol.py b/climate_api/ingest/protocol.py index 83a94212..fe47b7df 100644 --- a/climate_api/ingest/protocol.py +++ b/climate_api/ingest/protocol.py @@ -61,11 +61,11 @@ class IngestionPlugin(Protocol): rechunk_time (optional class attribute): target time chunk size for the post-ingest rechunk. When set, the orchestrator rewrites the store after all periods are committed so the time axis uses chunks of this size - instead of the per-period chunk-of-1. Declare as a class attribute - (``rechunk_time: int | None = None``) to skip rechunking, or set to a - positive int (30 for daily, 720 for hourly). This attribute is read via - ``getattr`` and is intentionally excluded from the Protocol so that - plugins that omit it still pass the ``isinstance`` check. + instead of the per-period chunk-of-1. Set to a positive int (30 for + daily, 720 for hourly) to enable rechunking. Omitting the attribute + entirely is equivalent to ``None`` — the orchestrator uses + ``getattr(plugin, "rechunk_time", None)`` so plugins that omit it still + pass the ``isinstance`` check. pyramid (optional class attribute): when ``True``, the orchestrator builds a multiscale pyramid after ingest completes. Level count is derived @@ -85,8 +85,10 @@ def probe(self, bbox: list[float], **params: Any) -> GridSpec: def periods(self, start: str, end: str) -> list[str]: """Return the ordered list of available period IDs from start to end. - May query the upstream source to confirm which periods are published. - The orchestrator uses the length of this list for progress reporting. + Must be pure computation — no I/O. The orchestrator calls periods() + directly (not via asyncio.to_thread), so blocking here stalls the event + loop. Apply any availability cutoff inside this method using today's date + and a fixed lag constant rather than querying the upstream source. Use enumerate_periods() as a helper for standard daily/hourly/yearly types. """ ... 
diff --git a/climate_api/ingestions/execution.py b/climate_api/ingestions/execution.py index 2c3592be..45bbac79 100644 --- a/climate_api/ingestions/execution.py +++ b/climate_api/ingestions/execution.py @@ -21,12 +21,11 @@ def execute_ingest( on_progress: Any | None = None, is_cancel_requested: Any | None = None, save_cursor: Any | None = None, - load_cursor: Any | None = None, ) -> dict[str, Any]: """Ingest one dataset for the configured extent and return a result summary. Accepts optional job-framework callbacks (on_progress, is_cancel_requested, - save_cursor, load_cursor) so that progress is visible when run as an async job. + save_cursor) so that progress is visible when run as an async job. """ dataset = registry_datasets.get_dataset(dataset_id) if dataset is None: @@ -43,7 +42,6 @@ def execute_ingest( on_progress=on_progress, is_cancel_requested=is_cancel_requested, save_cursor=save_cursor, - load_cursor=load_cursor, ) return { "status": "completed", diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 1e11f76e..355e095f 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -203,7 +203,6 @@ def create_artifact( on_progress: Any | None = None, is_cancel_requested: Any | None = None, save_cursor: Any | None = None, - load_cursor: Any | None = None, ) -> ArtifactRecord: """Ingest a dataset via its plugin, persist it locally, and store artifact metadata.""" period_type = str(dataset["period_type"]) @@ -255,7 +254,6 @@ def create_artifact( on_progress=on_progress, is_cancel_requested=is_cancel_requested, save_cursor=save_cursor, - load_cursor=load_cursor, ) @@ -272,7 +270,6 @@ def _create_icechunk_artifact( on_progress: Any | None = None, is_cancel_requested: Any | None = None, save_cursor: Any | None = None, - load_cursor: Any | None = None, ) -> ArtifactRecord: """Run per-period Icechunk ingest and register the resulting store as an artifact. 
@@ -346,7 +343,6 @@ def _create_icechunk_artifact( on_progress=on_progress, is_cancel_requested=is_cancel_requested, save_cursor=save_cursor, - load_cursor=load_cursor, ) if not store_path.exists(): diff --git a/tests/test_ingest_orchestrator.py b/tests/test_ingest_orchestrator.py index b676bb40..2baeede4 100644 --- a/tests/test_ingest_orchestrator.py +++ b/tests/test_ingest_orchestrator.py @@ -139,12 +139,11 @@ def test_run_ingest_is_idempotent(tmp_path: Path) -> None: assert committed == {"2024-01", "2024-02"} -def test_run_ingest_resumes_from_cursor(tmp_path: Path) -> None: - """Simulate a crash after the first batch (2024-01, 2024-02) and resume.""" +def test_run_ingest_resumes_from_store(tmp_path: Path) -> None: + """A second run reads committed period IDs from the store and only fetches missing ones.""" plugin = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04"]) store_path = tmp_path / "test.icechunk" - # First run writes all four periods but we stop with a cursor pointing at batch 1. asyncio.run( run_ingest( plugin=plugin, @@ -158,10 +157,7 @@ def test_run_ingest_resumes_from_cursor(tmp_path: Path) -> None: ) assert read_committed_period_ids(store_path, "monthly") == {"2024-01", "2024-02"} - # Resume: provide a cursor pointing at the last committed period. - cursor: dict[str, Any] = {"last_committed": "2024-02"} plugin2 = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04"]) - asyncio.run( run_ingest( plugin=plugin2, @@ -171,11 +167,10 @@ def test_run_ingest_resumes_from_cursor(tmp_path: Path) -> None: end="2024-04", store_path=store_path, period_type="monthly", - load_cursor=lambda: cursor, ) ) - # Only the two new periods were fetched. + # Only the two new periods were fetched; pre-existing ones were skipped. 
assert sorted(plugin2.fetched) == ["2024-03", "2024-04"] committed = read_committed_period_ids(store_path, "monthly") assert committed == {"2024-01", "2024-02", "2024-03", "2024-04"} @@ -281,6 +276,54 @@ def cancel_after_two() -> bool: ) +def test_run_ingest_preserves_committed_periods_on_fetch_error(tmp_path: Path) -> None: + """Periods committed before a fetch_period exception are retained in the store.""" + store_path = tmp_path / "test.icechunk" + + # Ingest 2024-01 and 2024-02 successfully. + seed_plugin = FakePlugin(["2024-01", "2024-02"]) + asyncio.run( + run_ingest( + plugin=seed_plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-02", + store_path=store_path, + period_type="monthly", + ) + ) + + # Now try to extend with 2024-03 through 2024-05; 2024-04 will raise. + class FailOnPeriod(FakePlugin): + def fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> xr.Dataset: + if period_id == "2024-04": + raise RuntimeError("simulated fetch failure") + return super().fetch_period(period_id, bbox, **params) + + failing_plugin = FailOnPeriod(["2024-01", "2024-02", "2024-03", "2024-04", "2024-05"]) + with pytest.raises(RuntimeError, match="simulated fetch failure"): + asyncio.run( + run_ingest( + plugin=failing_plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-05", + store_path=store_path, + period_type="monthly", + ) + ) + + # The store must be valid: pre-existing + 2024-03 committed; 2024-04 and 2024-05 not. 
+ committed = read_committed_period_ids(store_path, "monthly") + assert "2024-01" in committed + assert "2024-02" in committed + assert "2024-03" in committed + assert "2024-04" not in committed + assert "2024-05" not in committed + + # --------------------------------------------------------------------------- # Sync wrapper # --------------------------------------------------------------------------- From eac91cfc826f61c9b085ed9b8b52df4fbf5bbb47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 15:20:47 +0200 Subject: [PATCH 56/80] fix: expose format as "zarr" in ZarrListing responses Icechunk is an internal storage detail; the /zarr/ HTTP endpoint always serves standard Zarr to clients regardless of backing format. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingestions/services.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 355e095f..3bf8b6a7 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -581,7 +581,7 @@ def get_dataset_zarr_store_info_or_404(dataset_id: str) -> dict[str, object]: return { "kind": "ZarrListing", "dataset_id": dataset_id, - "format": artifact.format, + "format": "zarr", "path": ".", "crs": crs, "proj4": _crs_to_proj4(crs), @@ -620,7 +620,7 @@ def _icechunk_store_info(dataset_id: str, artifact: ArtifactRecord) -> dict[str, return { "kind": "ZarrListing", "dataset_id": dataset_id, - "format": artifact.format, + "format": "zarr", "path": ".", "crs": crs, "proj4": _crs_to_proj4(crs), From f9c004a4beeac478eecfafc0544bfb8d912a872b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 15:31:14 +0200 Subject: [PATCH 57/80] feat: live progress tracking in manage UI (ingest and sync) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both Ingest and Sync now stream SSE progress events via fetch() 
instead of blocking until redirect. The manage page shows a progress bar and period counter (N / total) while the job runs, then navigates on completion or surfaces errors inline without a page reload. Backend: on_progress threaded through sync_dataset → run_sync → create_artifact_fn so both code paths emit progress events. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingestions/services.py | 2 + climate_api/ingestions/sync_engine.py | 2 + climate_api/system/routes.py | 155 +++++++++++++++++--------- climate_api/templates/manage.html | 140 ++++++++++++++++++++++- 4 files changed, 243 insertions(+), 56 deletions(-) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 3bf8b6a7..68e21337 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -490,6 +490,7 @@ def sync_dataset( dataset_id: str, end: str | None, publish: bool, + on_progress: Any | None = None, ) -> SyncResponse: """Resolve sync inputs and delegate managed-dataset sync to the sync engine. @@ -528,6 +529,7 @@ def sync_dataset( create_artifact_fn=create_artifact, get_dataset_fn=get_dataset_or_404, current_end=committed_end, + on_progress=on_progress, ) except SyncConfigurationError as exc: raise HTTPException(status_code=500, detail=str(exc)) from exc diff --git a/climate_api/ingestions/sync_engine.py b/climate_api/ingestions/sync_engine.py index f8e72e2c..b084da2a 100644 --- a/climate_api/ingestions/sync_engine.py +++ b/climate_api/ingestions/sync_engine.py @@ -171,6 +171,7 @@ def run_sync( create_artifact_fn: Callable[..., ArtifactRecord], get_dataset_fn: Callable[[str], Any], current_end: str | None = None, + on_progress: Any | None = None, ) -> SyncResponse: """Plan and execute one sync operation for a managed dataset. 
@@ -248,6 +249,7 @@ def run_sync( bbox=list(latest_artifact.request_scope.bbox) if latest_artifact.request_scope.bbox is not None else None, overwrite=False, publish=publish, + on_progress=on_progress, ) logger.info( "Sync completed for dataset '%s': artifact_id=%s action=%s", diff --git a/climate_api/system/routes.py b/climate_api/system/routes.py index a0fd5439..5f90219f 100644 --- a/climate_api/system/routes.py +++ b/climate_api/system/routes.py @@ -1,11 +1,14 @@ """Root API endpoints.""" +import asyncio +import json import sys import urllib.parse from importlib.metadata import version as _pkg_version +from typing import Any, AsyncGenerator from fastapi import APIRouter, Request -from fastapi.responses import HTMLResponse, JSONResponse, Response +from fastapi.responses import HTMLResponse, JSONResponse, Response, StreamingResponse from starlette.responses import RedirectResponse from .schemas import AppInfo, HealthStatus, Status @@ -14,6 +17,18 @@ router = APIRouter() +async def _sse_stream( + task: asyncio.Task[None], + queue: asyncio.Queue[dict[str, Any] | None], +) -> AsyncGenerator[str, None]: + """Yield SSE events from queue until the task sentinel (None) arrives.""" + while True: + event = await queue.get() + if event is None: + break + yield f"data: {json.dumps(event)}\n\n" + + @router.get("/", response_class=Response, responses=ROOT_RESPONSES) def read_index(request: Request) -> Response: """Return the landing page (HTML) or a navigation object (JSON with ?f=json).""" @@ -42,70 +57,104 @@ def manage( @router.post("/manage/ingest", include_in_schema=False) -async def manage_ingest(request: Request) -> RedirectResponse: - """Handle ingest form submission and redirect to the management page.""" - from fastapi import HTTPException - +async def manage_ingest(request: Request) -> Response: + """Stream ingest progress as SSE, then signal redirect on completion.""" from climate_api.data_registry.services.datasets import get_dataset from 
climate_api.extents.services import get_extent_or_404 from climate_api.ingestions.services import create_artifact base = str(request.base_url).rstrip("/") - try: - form = await request.form() - dataset_id = str(form.get("dataset_id", "")) - start = str(form.get("start", "")) - end = str(form.get("end", "")) or None - publish = "publish" in form - overwrite = "overwrite" in form - - template = get_dataset(dataset_id) - if template is None: - msg = urllib.parse.quote(f"Dataset template '{dataset_id}' not found") - return RedirectResponse(f"{base}/manage?error={msg}", status_code=303) - - extent = get_extent_or_404() - resolved_bbox = list(extent["bbox"]) - - create_artifact( - dataset=template, - start=start, - end=end, - bbox=resolved_bbox, - overwrite=overwrite, - publish=publish, - ) - name = urllib.parse.quote(template.get("name", dataset_id)) - return RedirectResponse(f"{base}/manage?message=Ingested+{name}", status_code=303) - except HTTPException as exc: - msg = urllib.parse.quote(str(exc.detail)) - return RedirectResponse(f"{base}/manage?error={msg}", status_code=303) - except Exception as exc: - msg = urllib.parse.quote(str(exc)) + form = await request.form() + dataset_id = str(form.get("dataset_id", "")) + start = str(form.get("start", "")) + end = str(form.get("end", "")) or None + publish = "publish" in form + overwrite = "overwrite" in form + + template = get_dataset(dataset_id) + if template is None: + msg = urllib.parse.quote(f"Dataset template '{dataset_id}' not found") return RedirectResponse(f"{base}/manage?error={msg}", status_code=303) + extent = get_extent_or_404() + resolved_bbox = list(extent["bbox"]) + + queue: asyncio.Queue[dict[str, Any] | None] = asyncio.Queue() + loop = asyncio.get_event_loop() + error_holder: list[str] = [] + + def on_progress(done: int, total: int, message: str = "") -> None: + loop.call_soon_threadsafe(queue.put_nowait, {"done": done, "total": total, "message": message}) + + async def run() -> None: + try: + await 
asyncio.to_thread( + create_artifact, + dataset=template, + start=start, + end=end, + bbox=resolved_bbox, + overwrite=overwrite, + publish=publish, + on_progress=on_progress, + ) + except Exception as exc: + error_holder.append(str(exc)) + finally: + await queue.put(None) + + task = asyncio.create_task(run()) + + async def event_stream() -> AsyncGenerator[str, None]: + async for chunk in _sse_stream(task, queue): + yield chunk + if error_holder: + yield f"data: {json.dumps({'error': error_holder[0]})}\n\n" + else: + name = urllib.parse.quote(str(template.get("name", dataset_id))) + yield f"data: {json.dumps({'redirect': f'{base}/manage?message=Ingested+{name}'})}\n\n" + + return StreamingResponse(event_stream(), media_type="text/event-stream") -@router.post("/manage/sync", include_in_schema=False) -async def manage_sync(request: Request) -> RedirectResponse: - """Handle sync form submission and redirect to the management page.""" - from fastapi import HTTPException +@router.post("/manage/sync", include_in_schema=False) +async def manage_sync(request: Request) -> Response: + """Stream sync progress as SSE, then signal redirect on completion.""" from climate_api.ingestions.services import sync_dataset base = str(request.base_url).rstrip("/") - try: - form = await request.form() - dataset_id = str(form.get("dataset_id", "")) - publish = "publish" in form - - sync_dataset(dataset_id=dataset_id, end=None, publish=publish) - return RedirectResponse(f"{base}/manage?message=Sync+completed", status_code=303) - except HTTPException as exc: - msg = urllib.parse.quote(str(exc.detail)) - return RedirectResponse(f"{base}/manage?error={msg}", status_code=303) - except Exception as exc: - msg = urllib.parse.quote(str(exc)) - return RedirectResponse(f"{base}/manage?error={msg}", status_code=303) + form = await request.form() + dataset_id = str(form.get("dataset_id", "")) + publish = "publish" in form + + queue: asyncio.Queue[dict[str, Any] | None] = asyncio.Queue() + loop = 
asyncio.get_event_loop() + error_holder: list[str] = [] + + def on_progress(done: int, total: int, message: str = "") -> None: + loop.call_soon_threadsafe(queue.put_nowait, {"done": done, "total": total, "message": message}) + + async def run() -> None: + try: + await asyncio.to_thread( + sync_dataset, dataset_id=dataset_id, end=None, publish=publish, on_progress=on_progress + ) + except Exception as exc: + error_holder.append(str(exc)) + finally: + await queue.put(None) + + task = asyncio.create_task(run()) + + async def event_stream() -> AsyncGenerator[str, None]: + async for chunk in _sse_stream(task, queue): + yield chunk + if error_holder: + yield f"data: {json.dumps({'error': error_holder[0]})}\n\n" + else: + yield f"data: {json.dumps({'redirect': f'{base}/manage?message=Sync+completed'})}\n\n" + + return StreamingResponse(event_stream(), media_type="text/event-stream") @router.get("/health") diff --git a/climate_api/templates/manage.html b/climate_api/templates/manage.html index 2986ba9b..5c23939e 100644 --- a/climate_api/templates/manage.html +++ b/climate_api/templates/manage.html @@ -255,6 +255,41 @@ background: #f1f5f9; border-color: #94a3b8; } + .sync-btn:disabled { + opacity: 0.5; + cursor: not-allowed; + } + + .progress-area { + margin-top: 1rem; + display: none; + } + .progress-area.active { + display: block; + } + .progress-bar-track { + background: #e2e8f0; + border-radius: 99px; + height: 6px; + overflow: hidden; + margin-bottom: 0.4rem; + } + .progress-bar-fill { + background: #0066cc; + height: 100%; + width: 0%; + border-radius: 99px; + transition: width 0.3s ease; + } + .progress-text { + font-size: 0.78rem; + color: #64748b; + } + + button[type="submit"]:disabled { + opacity: 0.6; + cursor: not-allowed; + } .divider { border: none; @@ -361,8 +396,12 @@

Ingest dataset


- - Ingestion runs synchronously and may take several minutes. + + This may take several minutes. +
+
+
+
{% endif %} @@ -401,8 +440,12 @@

Ingested datasets {{ datasets | length }}

- +
+
+
+
+
{% endfor %} @@ -419,5 +462,96 @@

Ingested datasets {{ datasets | length }}

+ + From a7bc5b7eba74f0843ab65aadb445f7c2cae4014e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 15:32:44 +0200 Subject: [PATCH 58/80] fix: remove extent field from ingest form (single-extent instances) Co-Authored-By: Claude Sonnet 4.6 --- climate_api/templates/manage.html | 9 --------- 1 file changed, 9 deletions(-) diff --git a/climate_api/templates/manage.html b/climate_api/templates/manage.html index 5c23939e..d5acc78b 100644 --- a/climate_api/templates/manage.html +++ b/climate_api/templates/manage.html @@ -373,15 +373,6 @@

Ingest dataset

/> -
- - -
-
From 1e4a250aafc3b9ee5f7489eea1e5a69ecefdee99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 15:35:56 +0200 Subject: [PATCH 59/80] fix: add SSE headers and keepalive to unblock browser buffering Without Cache-Control: no-cache and a periodic keepalive comment, uvicorn/nginx buffer the text/event-stream response so no events reach the browser until the connection closes. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/system/routes.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/climate_api/system/routes.py b/climate_api/system/routes.py index 5f90219f..fa7f8992 100644 --- a/climate_api/system/routes.py +++ b/climate_api/system/routes.py @@ -17,13 +17,28 @@ router = APIRouter() +_SSE_HEADERS = { + "Cache-Control": "no-cache", + "X-Accel-Buffering": "no", + "Connection": "keep-alive", +} + + async def _sse_stream( task: asyncio.Task[None], queue: asyncio.Queue[dict[str, Any] | None], ) -> AsyncGenerator[str, None]: - """Yield SSE events from queue until the task sentinel (None) arrives.""" + """Yield SSE events from queue until the task sentinel (None) arrives. + + Sends a keepalive comment every 5 seconds so the connection is not + dropped and partial response buffers are flushed by the browser. 
+ """ while True: - event = await queue.get() + try: + event = await asyncio.wait_for(queue.get(), timeout=5.0) + except asyncio.TimeoutError: + yield ": keepalive\n\n" + continue if event is None: break yield f"data: {json.dumps(event)}\n\n" @@ -114,7 +129,7 @@ async def event_stream() -> AsyncGenerator[str, None]: name = urllib.parse.quote(str(template.get("name", dataset_id))) yield f"data: {json.dumps({'redirect': f'{base}/manage?message=Ingested+{name}'})}\n\n" - return StreamingResponse(event_stream(), media_type="text/event-stream") + return StreamingResponse(event_stream(), media_type="text/event-stream", headers=_SSE_HEADERS) @router.post("/manage/sync", include_in_schema=False) @@ -154,7 +169,7 @@ async def event_stream() -> AsyncGenerator[str, None]: else: yield f"data: {json.dumps({'redirect': f'{base}/manage?message=Sync+completed'})}\n\n" - return StreamingResponse(event_stream(), media_type="text/event-stream") + return StreamingResponse(event_stream(), media_type="text/event-stream", headers=_SSE_HEADERS) @router.get("/health") From c03ca9506da111e2242f1a5847780f80fd826a69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 15:45:59 +0200 Subject: [PATCH 60/80] perf: cache ERA5-Land remote store on plugin instance Opening the DestinE zarr store (auth handshake + metadata fetch) on every fetch_period call made each of 145+ periods pay the full connection overhead. Cache the opened and longitude-corrected dataset on the instance; subsequent calls reuse it. Double-checked lock guards against concurrent opens when max_concurrency=4. 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/plugins/era5_land.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/climate_api/ingest/plugins/era5_land.py b/climate_api/ingest/plugins/era5_land.py index eff2b835..ce8ffc2f 100644 --- a/climate_api/ingest/plugins/era5_land.py +++ b/climate_api/ingest/plugins/era5_land.py @@ -12,6 +12,7 @@ import logging import math +import threading from datetime import date, timedelta from typing import Any @@ -44,6 +45,8 @@ class Era5LandPlugin: def __init__(self, variable: str) -> None: self.variable = variable + self._cache_ds: xr.Dataset | None = None + self._cache_lock = threading.Lock() # ------------------------------------------------------------------ # Protocol implementation @@ -75,10 +78,13 @@ def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Datase hour = int(period_id[-2:]) if len(period_id) > 10 else 0 date_part = period_id[:10] - ds = self._open_remote() - ds = self._correct_longitude(ds) - ds = self._select_bbox(ds, bbox) - ds = ds.sel(valid_time=f"{date_part}T{hour:02d}") + if self._cache_ds is None: + with self._cache_lock: + if self._cache_ds is None: + logger.info("Opening ERA5-Land remote store: %s", _DESTINE_ZARR_URL) + self._cache_ds = self._correct_longitude(self._open_remote()) + ds = self._cache_ds + ds = self._select_bbox(ds, bbox).sel(valid_time=f"{date_part}T{hour:02d}") # Ensure a length-1 time dimension so append_dim="time" works correctly. if "valid_time" in ds.dims: From 9d99dc0d4b97ea35aab40ba38be96b74bf83f8b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 15:54:42 +0200 Subject: [PATCH 61/80] fix: add python-multipart as explicit dependency Required by FastAPI for form parsing in the manage routes (await request.form()). Was missing from pyproject.toml, causing install failures in downstream projects. 
Co-Authored-By: Claude Sonnet 4.6 --- pyproject.toml | 1 + uv.lock | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 0b7fc467..1e1cd40c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ dependencies = [ "rioxarray>=0.17", "portalocker>=3.2.0", "icechunk>=2.0,<3", + "python-multipart>=0.0.29", ] [project.urls] diff --git a/uv.lock b/uv.lock index 4389eb92..186eecab 100644 --- a/uv.lock +++ b/uv.lock @@ -380,6 +380,7 @@ dependencies = [ { name = "pygeoapi" }, { name = "pystac" }, { name = "python-dotenv" }, + { name = "python-multipart" }, { name = "rioxarray" }, { name = "starlette" }, { name = "topozarr" }, @@ -413,6 +414,7 @@ requires-dist = [ { name = "pygeoapi", specifier = ">=0.22.0" }, { name = "pystac", specifier = ">=1.10,<2" }, { name = "python-dotenv", specifier = ">=1.0.1" }, + { name = "python-multipart", specifier = ">=0.0.29" }, { name = "rioxarray", specifier = ">=0.17" }, { name = "starlette", specifier = ">=0.27.0" }, { name = "topozarr", specifier = "==0.0.*" }, @@ -2258,6 +2260,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" }, ] +[[package]] +name = "python-multipart" +version = "0.0.29" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4e/fe/70bd71a6738b09a0bdf6480ca6436b167469ca4578b2a0efbe390b4b0e70/python_multipart-0.0.29.tar.gz", hash = "sha256:643e93849196645e2dbdd81a0f8829a23123ad7f797a84a364c6fb3563f18904", size = 45678, upload-time = "2026-05-17T17:29:47.654Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/cb/769cfc37177252872a45a71f3fbdde9d51b471a3f3c14bfe95dde3407386/python_multipart-0.0.29-py3-none-any.whl", hash = 
"sha256:2ddcc971cef266225f54f552d8fa10bcfbb1f14446caec199060daac59ff2d69", size = 29640, upload-time = "2026-05-17T17:29:45.69Z" }, +] + [[package]] name = "pytz" version = "2026.2" From ad867d1e5da3d20a3947ea9c9e4832e603e38fc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 16:01:32 +0200 Subject: [PATCH 62/80] fix: expand scalar time coord before passing to xstac xstac crashes with ValueError when the time dimension has only one element (0-d scalar after sel). Expand to a 1-element array so min/max concatenation succeeds. Fixes 503 when viewing a dataset whose first ingest batch has only committed a single period. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/stac/services.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 122f1b1a..974d427b 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -222,6 +222,10 @@ def _build_collection_with_xstac(*, artifact: ArtifactRecord, template: pystac.C reference_system = int(detected_crs.split(":")[-1]) if detected_crs else 4326 except ValueError: reference_system = 4326 + # xstac crashes on a scalar (0-d) time coordinate when computing + # min/max for the temporal extent. Expand to a 1-element array first. + if time_dimension and time_dimension in ds.coords and ds[time_dimension].ndim == 0: + ds = ds.expand_dims(time_dimension) result = xarray_to_stac( ds, template, From ce1760d09b1ea7e1d6e67fb875976fd713d8598d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 16:08:21 +0200 Subject: [PATCH 63/80] fix: hourly end date normalizes to T23 when given a bare date A date-only end like '2026-01-01' was normalizing to T00, causing hourly ingests to include only a single period instead of all 24 hours of the day. Now is_end=True maps a date-only input to T23. 
Also expand scalar time dimension before xstac call to avoid a crash when a dataset contains exactly one committed period. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingestions/services.py | 18 +++++++++++------- climate_api/shared/time.py | 15 ++++++++++++--- climate_api/stac/services.py | 2 +- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 68e21337..21b42651 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -207,11 +207,13 @@ def create_artifact( """Ingest a dataset via its plugin, persist it locally, and store artifact metadata.""" period_type = str(dataset["period_type"]) start = _normalize_request_period(start, period_type=period_type, field_name="start") - end = _normalize_optional_request_period(end, period_type=period_type, field_name="end") + end = _normalize_optional_request_period(end, period_type=period_type, field_name="end", is_end=True) download_start = _normalize_optional_request_period( download_start, period_type=period_type, field_name="download_start" ) - download_end = _normalize_optional_request_period(download_end, period_type=period_type, field_name="download_end") + download_end = _normalize_optional_request_period( + download_end, period_type=period_type, field_name="download_end", is_end=True + ) _validate_download_scope( start=start, end=end, @@ -445,7 +447,7 @@ def store_materialized_zarr_artifact( """Store metadata for a locally materialized Zarr artifact.""" period_type = str(dataset["period_type"]) normalized_start = _normalize_request_period(start, period_type=period_type, field_name="start") - normalized_end = _normalize_optional_request_period(end, period_type=period_type, field_name="end") + normalized_end = _normalize_optional_request_period(end, period_type=period_type, field_name="end", is_end=True) request_scope = ArtifactRequestScope( start=normalized_start, end=normalized_end, @@ 
-951,10 +953,10 @@ def _find_existing_artifact( ) -def _normalize_request_period(value: str, *, period_type: str, field_name: str) -> str: +def _normalize_request_period(value: str, *, period_type: str, field_name: str, is_end: bool = False) -> str: """Normalize a required request period or raise a clear client error.""" try: - return normalize_period_string(value, period_type) + return normalize_period_string(value, period_type, is_end=is_end) except (TypeError, ValueError) as exc: raise HTTPException( status_code=400, @@ -962,11 +964,13 @@ def _normalize_request_period(value: str, *, period_type: str, field_name: str) ) from exc -def _normalize_optional_request_period(value: str | None, *, period_type: str, field_name: str) -> str | None: +def _normalize_optional_request_period( + value: str | None, *, period_type: str, field_name: str, is_end: bool = False +) -> str | None: """Normalize an optional request period or raise a clear client error.""" if value is None: return None - return _normalize_request_period(value, period_type=period_type, field_name=field_name) + return _normalize_request_period(value, period_type=period_type, field_name=field_name, is_end=is_end) def _default_request_end(period_type: str) -> str: diff --git a/climate_api/shared/time.py b/climate_api/shared/time.py index 922f65fe..1a871e37 100644 --- a/climate_api/shared/time.py +++ b/climate_api/shared/time.py @@ -132,11 +132,20 @@ def parse_weekly_period_string(value: str) -> datetime: return datetime.fromisoformat(value) -def normalize_period_string(value: str, period_type: str) -> str: - """Normalize an input period string to the dataset-native period format.""" +def normalize_period_string(value: str, period_type: str, *, is_end: bool = False) -> str: + """Normalize an input period string to the dataset-native period format. + + When is_end=True and period_type='hourly', a date-only input (YYYY-MM-DD) + is treated as the last hour of that day (T23) rather than T00. 
+ """ if period_type == "hourly": try: - return datetime_to_period_string(parse_hourly_period_string(value), period_type) + dt = parse_hourly_period_string(value) + # A bare date with no time component defaults to midnight; for an end + # bound that means the user intended the last hour of the day. + if is_end and len(value) == 10: + dt = dt.replace(hour=23) + return datetime_to_period_string(dt, period_type) except ValueError as exc: raise ValueError(f"Invalid hourly period '{value}'; expected YYYY-MM-DDTHH or ISO datetime") from exc if period_type == "daily": diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 974d427b..3ad390e6 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -224,7 +224,7 @@ def _build_collection_with_xstac(*, artifact: ArtifactRecord, template: pystac.C reference_system = 4326 # xstac crashes on a scalar (0-d) time coordinate when computing # min/max for the temporal extent. Expand to a 1-element array first. - if time_dimension and time_dimension in ds.coords and ds[time_dimension].ndim == 0: + if time_dimension and hasattr(ds, "coords") and time_dimension in ds.coords and ds[time_dimension].ndim == 0: ds = ds.expand_dims(time_dimension) result = xarray_to_stac( ds, From cdcc94a5a952397db28e8c997b157c2039df2727 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 16:45:30 +0200 Subject: [PATCH 64/80] fix: get_time_dim checks actual dims, not just coordinates ERA5-Land stores have a scalar valid_time coordinate alongside the true time dimension. The old hasattr check matched the scalar first, giving coverage start == end == last-fetched-hour instead of the full range. Checking ds.dims instead ensures only actual array dimensions are returned. 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/data_manager/services/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/climate_api/data_manager/services/utils.py b/climate_api/data_manager/services/utils.py index f4e74dee..721febe7 100644 --- a/climate_api/data_manager/services/utils.py +++ b/climate_api/data_manager/services/utils.py @@ -5,8 +5,9 @@ def get_time_dim(ds: Any) -> str: """Return the name of the time dimension in a dataset or dataframe.""" + actual_dims: set[str] = set(getattr(ds, "dims", {}) or {}) for time_name in ["valid_time", "time"]: - if hasattr(ds, time_name): + if time_name in actual_dims: return time_name raise ValueError(f"Unable to find time dimension: {getattr(ds, 'coords', repr(ds))}") From 206702881e4ae642373203c04b45b223775d380a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 16:50:57 +0200 Subject: [PATCH 65/80] fix: detect WGS84 from coordinate units when spatial_ref is absent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Datasets like ERA5-Land have no spatial_ref coordinate but do have x/y dimensions with units=degrees_east/degrees_north. The old detection only checked spatial_ref, causing the deployment CRS (EPSG:32633 for Norway) to be used instead of EPSG:4326 — producing a wrong proj:code in the STAC collection and a garbage bbox from a false UTM→WGS84 reprojection. Now both _detect_dataset_crs and _read_crs_from_spatial_ref fall back to dimension attribute inspection. 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingestions/services.py | 42 ++++++++++++++++++--------- climate_api/stac/services.py | 46 +++++++++++++++++++----------- 2 files changed, 57 insertions(+), 31 deletions(-) diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 21b42651..d10b2c15 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -79,20 +79,34 @@ def _check_bbox_overlap(dataset: dict[str, object], instance_bbox: list[float]) def _read_crs_from_spatial_ref(ds: xr.Dataset) -> str | None: - """Return 'EPSG:' from a dataset's spatial_ref coordinate, or None.""" - if "spatial_ref" not in ds.coords: - return None - try: - import pyproj - - attrs = dict(ds["spatial_ref"].attrs) - wkt = attrs.get("crs_wkt") or attrs.get("spatial_ref") - if not wkt: - return None - epsg = pyproj.CRS.from_wkt(str(wkt)).to_epsg() - return f"EPSG:{epsg}" if epsg else None - except Exception: - return None + """Return an EPSG CRS string from a dataset, or None if undetectable. + + Checks spatial_ref WKT first, then falls back to dimension units/standard_name + so that datasets like ERA5-Land (no spatial_ref, but degrees_east/north units) + are not misidentified as projected. 
+ """ + if "spatial_ref" in ds.coords: + try: + import pyproj + + attrs = dict(ds["spatial_ref"].attrs) + wkt = attrs.get("crs_wkt") or attrs.get("spatial_ref") + if wkt: + epsg = pyproj.CRS.from_wkt(str(wkt)).to_epsg() + if epsg: + return f"EPSG:{epsg}" + except Exception: + pass + for dim in set(ds.dims): + if dim not in ds.coords: + continue + attrs = dict(ds[dim].attrs) + if attrs.get("units") in ("degrees_east", "degrees_north") or attrs.get("standard_name") in ( + "longitude", + "latitude", + ): + return "EPSG:4326" + return None def _resolve_artifacts_dir() -> Path: diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 3ad390e6..346e5f3b 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -515,23 +515,35 @@ def _zarr_consolidated_flag(artifact_path: str) -> bool | None: def _detect_dataset_crs(ds: Any) -> str | None: - """Read the EPSG CRS code from a dataset's spatial_ref coordinate, if present. + """Read the EPSG CRS code from a dataset, or None if undetectable. - Returns a string like 'EPSG:4326' or None if undetectable. Used to override - the deployment-wide proj:code with the actual native CRS of the data so that - datasets stored in WGS84 (e.g. WorldPop) are not misidentified as projected. + Checks (in order): spatial_ref WKT coordinate, then dimension units/standard_name. + Used to override the deployment-wide proj:code with the actual native CRS of the + data so that datasets stored in WGS84 (e.g. ERA5-Land, WorldPop) are not + misidentified as projected. 
""" - if not hasattr(ds, "coords") or "spatial_ref" not in ds.coords: - return None - try: - import pyproj - - attrs = dict(ds["spatial_ref"].attrs) - wkt = attrs.get("crs_wkt") or attrs.get("spatial_ref") - if not wkt: - return None - crs = pyproj.CRS.from_wkt(str(wkt)) - epsg = crs.to_epsg() - return f"EPSG:{epsg}" if epsg else None - except Exception: + if not hasattr(ds, "coords"): return None + if "spatial_ref" in ds.coords: + try: + import pyproj + + attrs = dict(ds["spatial_ref"].attrs) + wkt = attrs.get("crs_wkt") or attrs.get("spatial_ref") + if wkt: + crs = pyproj.CRS.from_wkt(str(wkt)) + epsg = crs.to_epsg() + if epsg: + return f"EPSG:{epsg}" + except Exception: + pass + for dim in set(getattr(ds, "dims", {})): + if dim not in ds.coords: + continue + attrs = dict(ds[dim].attrs) + if attrs.get("units") in ("degrees_east", "degrees_north") or attrs.get("standard_name") in ( + "longitude", + "latitude", + ): + return "EPSG:4326" + return None From cf5a443b6e9b7d72f9e82e900e6a75659652b943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 17:24:00 +0200 Subject: [PATCH 66/80] fix: detect pyramid Icechunk stores for correct STAC zarr href _is_pyramid_zarr only checked for a "0/" subdirectory, which exists in flat zarr pyramid stores but not in Icechunk stores. Icechunk stores use an opaque internal layout (chunks/, manifests/, etc.) regardless of whether the data has pyramid sub-groups. Now falls back to opening the Icechunk store and checking if "0" is a root group member, so that the STAC zarr asset href correctly points to /zarr/{id}/0 instead of the root group, allowing the map viewer to render pyramid data. 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/stac/services.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 346e5f3b..0571cf1c 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -320,7 +320,22 @@ def _is_pyramid_zarr(artifact_path: str) -> bool: """Return True if artifact_path is a multiscale pyramid zarr store.""" if "://" in artifact_path: return False - return (Path(artifact_path) / "0").is_dir() + path = Path(artifact_path) + if (path / "0").is_dir(): + return True + if path.suffix == ".icechunk": + try: + import zarr + + from climate_api.ingest.store import open_or_create_repo + + repo = open_or_create_repo(path) + session = repo.readonly_session("main") + root = zarr.open_group(session.store, mode="r") + return "0" in root + except Exception: + return False + return False def _abs_url(request: Request, path: str) -> str: From 0e95ec4a9cf8d0afd636bdd23cce5c3bdc0452ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 18:35:44 +0200 Subject: [PATCH 67/80] fix: reduce CHIRPS3 max_concurrency to 1 to avoid rate-limit bans Sending 4 concurrent requests to data.chc.ucsb.edu triggered a CrowdSec IP ban (HTTP 403). Serial fetching is sufficient; each COG range request completes in under a second. Also adds retry logic for 429/503 responses. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/plugins/chirps3.py | 18 ++++++++++++++++-- tests/test_ingest_plugins.py | 2 +- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/climate_api/ingest/plugins/chirps3.py b/climate_api/ingest/plugins/chirps3.py index 98fd860d..34e8f0bd 100644 --- a/climate_api/ingest/plugins/chirps3.py +++ b/climate_api/ingest/plugins/chirps3.py @@ -20,6 +20,7 @@ import calendar import logging +import time from datetime import date from typing import Any @@ -47,7 +48,7 @@ class Chirps3Plugin: 'sat' for prelim. 
Defaults to 'rnl' (final/rnl recommended). """ - max_concurrency = 4 + max_concurrency = 1 commit_batch_size = 30 rechunk_time = 30 @@ -91,7 +92,20 @@ def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Datase url = self._url_for_day(d) logger.info("Fetching CHIRPS3 %s: %s", period_id, url) - da = rioxarray.open_rasterio(url, chunks=None, masked=True, lock=False) + for attempt in range(3): + try: + da = rioxarray.open_rasterio(url, chunks=None, masked=True, lock=False) + break + except Exception as exc: + msg = str(exc) + if "429" in msg or "503" in msg: + wait = 10 * (2**attempt) + logger.warning("CHIRPS3 HTTP error (%s), retrying in %ds: %s", msg[:60], wait, url) + time.sleep(wait) + if attempt == 2: + raise + else: + raise if not isinstance(da, xr.DataArray): raise TypeError(f"rioxarray.open_rasterio returned {type(da).__name__!r}, expected DataArray") xmin, ymin, xmax, ymax = map(float, bbox) diff --git a/tests/test_ingest_plugins.py b/tests/test_ingest_plugins.py index a5b8f7f4..05304ede 100644 --- a/tests/test_ingest_plugins.py +++ b/tests/test_ingest_plugins.py @@ -204,7 +204,7 @@ def test_satisfies_protocol(self) -> None: assert isinstance(plugin, IngestionPlugin) def test_max_concurrency(self) -> None: - assert self._make_plugin().max_concurrency == 4 + assert self._make_plugin().max_concurrency == 1 def test_commit_batch_size(self) -> None: assert self._make_plugin().commit_batch_size == 30 From 464299f520020c9d6353b1f565fc0d332b995891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 19:14:56 +0200 Subject: [PATCH 68/80] fix: handle timeless datasets in coverage_from_open_dataset Static stores (time_dim=False, e.g. Copernicus DEM) have no time coordinate. get_time_dim raises ValueError; catch it and return start=end=None so artifact creation completes normally. 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/data_accessor/services/accessor.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/climate_api/data_accessor/services/accessor.py b/climate_api/data_accessor/services/accessor.py index f1307398..62934b48 100644 --- a/climate_api/data_accessor/services/accessor.py +++ b/climate_api/data_accessor/services/accessor.py @@ -143,11 +143,14 @@ def _coverage_from_dataset(*, ds: xr.Dataset, period_type: str, native_crs: str }, } - time_dim = get_time_dim(ds) x_dim, y_dim = get_x_y_dims(ds) - start = _period_string_scalar(numpy_datetime_to_period_string(ds[time_dim].min(), period_type)) # type: ignore[arg-type] - end = _period_string_scalar(numpy_datetime_to_period_string(ds[time_dim].max(), period_type)) # type: ignore[arg-type] + try: + time_dim = get_time_dim(ds) + start = _period_string_scalar(numpy_datetime_to_period_string(ds[time_dim].min(), period_type)) # type: ignore[arg-type] + end = _period_string_scalar(numpy_datetime_to_period_string(ds[time_dim].max(), period_type)) # type: ignore[arg-type] + except ValueError: + start = end = None xmin, xmax = ds[x_dim].min().item(), ds[x_dim].max().item() ymin, ymax = ds[y_dim].min().item(), ds[y_dim].max().item() From fa44c5701e5527145a10438c5bc5853d864a6dda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 19:16:14 +0200 Subject: [PATCH 69/80] fix: detect completed timeless stores in read_committed_period_ids A static (time_dim=False) store has no time coordinate, so read_committed_period_ids previously always returned empty, causing every re-ingest to re-fetch all source tiles. If the store has spatial data with non-zero dimensions, return {"static"} so the orchestrator treats it as already complete and skips the download. 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/store.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/climate_api/ingest/store.py b/climate_api/ingest/store.py index 0ddfbaea..00a5a5a1 100644 --- a/climate_api/ingest/store.py +++ b/climate_api/ingest/store.py @@ -162,6 +162,9 @@ def read_committed_period_ids(store_path: Path, period_type: str) -> set[str]: ds.close() ds = xr.open_zarr(session.store, group="0") if "time" not in ds.coords: + # Timeless (static) store: if it has spatial data, treat as complete. + if ds.sizes and all(s > 0 for s in ds.sizes.values()): + return {"static"} return set() import pandas as pd From 54048df81544c4f474dcd3e0c2b06bbd4ba458c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 19:21:45 +0200 Subject: [PATCH 70/80] fix: allow null temporal coverage for static (timeless) datasets CoverageTemporal.start/end now accept None so that datasets ingested with time_dim=False (e.g. Copernicus DEM elevation) can be recorded without a temporal extent. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingestions/schemas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/climate_api/ingestions/schemas.py b/climate_api/ingestions/schemas.py index 3c267993..f5becb6b 100644 --- a/climate_api/ingestions/schemas.py +++ b/climate_api/ingestions/schemas.py @@ -53,8 +53,8 @@ class CoverageSpatial(BaseModel): class CoverageTemporal(BaseModel): """Temporal extent summary.""" - start: str = Field(description="First covered time period in dataset-native string form.") - end: str = Field(description="Last covered time period in dataset-native string form.") + start: str | None = Field(description="First covered time period in dataset-native string form. None for static (timeless) datasets.") + end: str | None = Field(description="Last covered time period in dataset-native string form. 
None for static (timeless) datasets.") class ArtifactCoverage(BaseModel): From 99db6fc008ab7cec3c00f40a9c4f078bc4fa1e32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 19:23:53 +0200 Subject: [PATCH 71/80] fix: handle timeless datasets in STAC collection building MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two crashes for static stores (time_dim=False): 1. parse_period_string_to_datetime(None) in temporal extent — guard with conditional so None start/end pass through as None (valid STAC). 2. get_time_dim(ds) raises ValueError — catch it and set time_dimension=None so xstac omits the time cube:dimension for timeless datasets. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/stac/services.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 0571cf1c..69971f8d 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -155,7 +155,10 @@ def _build_collection_template( extent=pystac.Extent( spatial=pystac.SpatialExtent([[spatial.xmin, spatial.ymin, spatial.xmax, spatial.ymax]]), temporal=pystac.TemporalExtent( - [[parse_period_string_to_datetime(temporal.start), parse_period_string_to_datetime(temporal.end)]] + [[ + parse_period_string_to_datetime(temporal.start) if temporal.start else None, + parse_period_string_to_datetime(temporal.end) if temporal.end else None, + ]] ), ), title=artifact.dataset_name, @@ -211,7 +214,10 @@ def _build_collection_with_xstac(*, artifact: ArtifactRecord, template: pystac.C ) from exc try: x_dimension, y_dimension = get_x_y_dims(ds) - time_dimension = get_time_dim(ds) + try: + time_dimension = get_time_dim(ds) + except ValueError: + time_dimension = None # Detect the actual data CRS so proj:code reflects the store's native coordinate # system rather than the deployment CRS. This matters when a dataset (e.g. 
WorldPop) # is stored in WGS84 while the deployment is configured for a projected CRS. From a830268795180ab18eedb065c10e4b526b823b2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 19:25:07 +0200 Subject: [PATCH 72/80] fix: skip xstac for timeless stores, build spatial cube:dimensions manually xstac requires a temporal_dimension and raises KeyError when None is passed and no CF T axis exists. For static (time_dim=False) datasets, bypass xstac entirely and build only x/y spatial cube:dimensions. Co-Authored-By: Claude Sonnet 4.6 --- climate_api/stac/services.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 69971f8d..98b5a7e3 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -228,9 +228,20 @@ def _build_collection_with_xstac(*, artifact: ArtifactRecord, template: pystac.C reference_system = int(detected_crs.split(":")[-1]) if detected_crs else 4326 except ValueError: reference_system = 4326 + if time_dimension is None: + # xstac requires a temporal dimension; skip it for timeless (static) + # stores and build only spatial cube:dimensions by hand. + payload = template.to_dict(include_self_link=False) + payload["cube:dimensions"] = { + x_dimension: {"type": "spatial", "axis": "x", "reference_system": reference_system}, + y_dimension: {"type": "spatial", "axis": "y", "reference_system": reference_system}, + } + _cache_xstac_collection_payload(artifact.artifact_id, payload) + return deepcopy(payload) + # xstac crashes on a scalar (0-d) time coordinate when computing # min/max for the temporal extent. Expand to a 1-element array first. 
- if time_dimension and hasattr(ds, "coords") and time_dimension in ds.coords and ds[time_dimension].ndim == 0: + if hasattr(ds, "coords") and time_dimension in ds.coords and ds[time_dimension].ndim == 0: ds = ds.expand_dims(time_dimension) result = xarray_to_stac( ds, @@ -246,7 +257,7 @@ def _build_collection_with_xstac(*, artifact: ArtifactRecord, template: pystac.C # clear xstac/pystac-owned links before serialization to avoid root-link # resolution attempts during to_dict(). result.clear_links() - payload: dict[str, Any] = result.to_dict(include_self_link=False) + payload = result.to_dict(include_self_link=False) _cache_xstac_collection_payload(artifact.artifact_id, payload) return deepcopy(payload) except HTTPException: From 2640ccb798bfff852717c004ceaa66a9049cfb09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 19:26:33 +0200 Subject: [PATCH 73/80] fix: guard _override_temporal_extent_from_artifact against None start/end Static datasets have temporal.start=None and temporal.end=None. Return early with [[null, null]] STAC interval instead of crashing on parse_period_string_to_datetime(None). 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/stac/services.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 98b5a7e3..854a3619 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -393,14 +393,12 @@ def _override_spatial_extent_from_artifact(collection: dict[str, Any], artifact: def _override_temporal_extent_from_artifact(collection: dict[str, Any], artifact: ArtifactRecord) -> None: temporal = artifact.coverage.temporal + if temporal.start is None and temporal.end is None: + collection["extent"]["temporal"]["interval"] = [[None, None]] + return start = parse_period_string_to_datetime(temporal.start).isoformat().replace("+00:00", "Z") end = parse_period_string_to_datetime(temporal.end).isoformat().replace("+00:00", "Z") - collection["extent"]["temporal"]["interval"] = [ - [ - start, - end, - ] - ] + collection["extent"]["temporal"]["interval"] = [[start, end]] dimensions = collection.setdefault("cube:dimensions", {}) for key, value in dimensions.items(): if isinstance(value, dict) and value.get("type") == "temporal": From 258b0677f400175214e87c94e600ec594df2ca92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 19:53:53 +0200 Subject: [PATCH 74/80] fix: expose pyramid root URL to zarr-layer for multiscale selection zarr-layer reads the multiscales attribute at the root to select the appropriate overview level based on viewport zoom. Pointing it at /0 (full resolution) bypassed this and caused all-chunks-at-full-res rendering. 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/stac/services.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 854a3619..c4b5673d 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -328,8 +328,6 @@ def _public_zarr_asset_href( source_dataset: dict[str, Any], ) -> str: artifact_path = _artifact_store_path(artifact) - if _is_pyramid_zarr(artifact_path): - return _abs_url(request, f"/zarr/{dataset_id}/0") return _abs_url(request, f"/zarr/{dataset_id}") From 7c20b97be5988027577e6a6e1512198e715792e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 20:36:28 +0200 Subject: [PATCH 75/80] feat: palette LUT support for categorical datasets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a display.palette alternative to display.colormap + range in dataset YAML. A palette is a dict mapping uint8 pixel value → hex color, which is emitted as climate_api:palette in the STAC renders object. The map viewer builds a 256-entry LUT directly (lut[value] = color) and passes it to zarr-layer with clim=[0,255] so no rescaling occurs — each class value indexes its own color. Legend range labels are suppressed for palette mode since the axis has no numeric meaning. 
Co-Authored-By: Claude Sonnet 4.6 --- climate_api/stac/services.py | 14 ++++++++++---- climate_api/templates/map-viewer.html | 20 +++++++++++++++----- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index c4b5673d..25ea9c72 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -504,15 +504,21 @@ def _build_renders(artifact: ArtifactRecord, source_dataset: dict[str, Any]) -> return None colormap_name = display.get("colormap") value_range = display.get("range") - if not isinstance(colormap_name, str) or not isinstance(value_range, list) or len(value_range) != 2: - return None + palette = display.get("palette") + render: dict[str, Any] = { "title": artifact.dataset_name, "assets": ["zarr"], - "rescale": [[float(value_range[0]), float(value_range[1])]], - "colormap_name": colormap_name, "climate_api:variable": artifact.variable, } + + if isinstance(palette, dict) and palette: + render["climate_api:palette"] = {str(k): str(v) for k, v in palette.items()} + elif isinstance(colormap_name, str) and isinstance(value_range, list) and len(value_range) == 2: + render["colormap_name"] = colormap_name + render["rescale"] = [[float(value_range[0]), float(value_range[1])]] + else: + return None nodata = display.get("nodata") if nodata is not None: render["nodata"] = float(nodata) diff --git a/climate_api/templates/map-viewer.html b/climate_api/templates/map-viewer.html index 094d2ebf..0f9dc032 100644 --- a/climate_api/templates/map-viewer.html +++ b/climate_api/templates/map-viewer.html @@ -261,6 +261,15 @@

Climate API

} } + function buildPaletteLut(palette) { + const lut = new Array(256).fill("#000000"); + for (const [value, color] of Object.entries(palette)) { + const idx = parseInt(value, 10); + if (idx >= 0 && idx < 256) lut[idx] = color; + } + return lut; + } + // Resolve the temporal dimension key and step list from cube:dimensions. function getTimeDimKey(dimensions) { for (const [key, val] of Object.entries(dimensions ?? {})) { @@ -345,8 +354,8 @@

Climate API

(_, i) => cm[Math.round((i * (cm.length - 1)) / 31)] ); legendBar.style.background = `linear-gradient(to right, ${stops.join(", ")})`; - legendMin.textContent = clim[0]; - legendMax.textContent = clim[1]; + legendMin.textContent = clim ? clim[0] : ""; + legendMax.textContent = clim ? clim[1] : ""; legendUnits.textContent = units ? `(${units})` : ""; legendEl.classList.remove("hidden"); } @@ -462,7 +471,8 @@

Climate API

return; } - const clim = renders.rescale?.[0] ?? [0, 100]; + const palette = renders["climate_api:palette"] ?? null; + const clim = palette ? [0, 255] : (renders.rescale?.[0] ?? [0, 100]); const colormapName = renders.colormap_name ?? "viridis"; const fillValue = renders.nodata ?? null; const variable = @@ -498,7 +508,7 @@

Climate API

let cm; try { - cm = buildColormap(colormapName); + cm = palette ? buildPaletteLut(palette) : buildColormap(colormapName); const zarrVersion = zarr["zarr:zarr_format"] ?? null; const selector = timeStepCount() > 0 ? { [timeDimKey]: 0 } : {}; @@ -552,7 +562,7 @@

Climate API

metaUnits.textContent = units || "—"; datasetMeta.classList.remove("hidden"); - updateLegend(cm, clim, units); + updateLegend(cm, palette ? null : clim, units); setStatus(""); } From 4c22d02545e788fa9410d274e9cb85a865c2ef77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 22:17:16 +0200 Subject: [PATCH 76/80] fix: resolve ruff lint errors after merge with main - Wrap long Field descriptions in ingestions/schemas.py - Remove unused artifact_path assignment in stac/services.py Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingestions/schemas.py | 8 ++++++-- climate_api/stac/services.py | 11 ++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/climate_api/ingestions/schemas.py b/climate_api/ingestions/schemas.py index f5becb6b..74150e7d 100644 --- a/climate_api/ingestions/schemas.py +++ b/climate_api/ingestions/schemas.py @@ -53,8 +53,12 @@ class CoverageSpatial(BaseModel): class CoverageTemporal(BaseModel): """Temporal extent summary.""" - start: str | None = Field(description="First covered time period in dataset-native string form. None for static (timeless) datasets.") - end: str | None = Field(description="Last covered time period in dataset-native string form. None for static (timeless) datasets.") + start: str | None = Field( + description="First covered time period in dataset-native string form. None for static (timeless) datasets." + ) + end: str | None = Field( + description="Last covered time period in dataset-native string form. None for static (timeless) datasets." 
+ ) class ArtifactCoverage(BaseModel): diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 25ea9c72..da6a7151 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -155,10 +155,12 @@ def _build_collection_template( extent=pystac.Extent( spatial=pystac.SpatialExtent([[spatial.xmin, spatial.ymin, spatial.xmax, spatial.ymax]]), temporal=pystac.TemporalExtent( - [[ - parse_period_string_to_datetime(temporal.start) if temporal.start else None, - parse_period_string_to_datetime(temporal.end) if temporal.end else None, - ]] + [ + [ + parse_period_string_to_datetime(temporal.start) if temporal.start else None, + parse_period_string_to_datetime(temporal.end) if temporal.end else None, + ] + ] ), ), title=artifact.dataset_name, @@ -327,7 +329,6 @@ def _public_zarr_asset_href( artifact: ArtifactRecord, source_dataset: dict[str, Any], ) -> str: - artifact_path = _artifact_store_path(artifact) return _abs_url(request, f"/zarr/{dataset_id}") From 659112980d24dd3b886be52d60d4c30676437b2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 22:21:29 +0200 Subject: [PATCH 77/80] fix: resolve all mypy type errors - accessor.py: annotate start/end as str | None before try/except - sync_engine.py: guard against None current_end for temporal/release sync - stac/services.py: handle None temporal bounds independently when formatting - test_stac.py: update pyramid href assertions to expect root URL (not /0) Co-Authored-By: Claude Sonnet 4.6 --- climate_api/data_accessor/services/accessor.py | 2 ++ climate_api/ingestions/sync_engine.py | 2 ++ climate_api/stac/services.py | 11 ++++++----- tests/test_stac.py | 8 ++++---- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/climate_api/data_accessor/services/accessor.py b/climate_api/data_accessor/services/accessor.py index 62934b48..6fbdd0e4 100644 --- a/climate_api/data_accessor/services/accessor.py +++ 
b/climate_api/data_accessor/services/accessor.py @@ -145,6 +145,8 @@ def _coverage_from_dataset(*, ds: xr.Dataset, period_type: str, native_crs: str x_dim, y_dim = get_x_y_dims(ds) + start: str | None + end: str | None try: time_dim = get_time_dim(ds) start = _period_string_scalar(numpy_datetime_to_period_string(ds[time_dim].min(), period_type)) # type: ignore[arg-type] diff --git a/climate_api/ingestions/sync_engine.py b/climate_api/ingestions/sync_engine.py index b084da2a..f95c8604 100644 --- a/climate_api/ingestions/sync_engine.py +++ b/climate_api/ingestions/sync_engine.py @@ -77,6 +77,8 @@ def plan_sync( target_end=current_end, target_end_source="current_coverage", ) + if current_end is None: + raise ValueError(f"Cannot plan sync for {sync_kind.value} dataset with no existing temporal coverage") period_type = str(source_dataset["period_type"]) normalized_requested_end = requested_end.strip() if isinstance(requested_end, str) else None normalized_requested_end = normalized_requested_end or None diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index da6a7151..fa00abf9 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -392,11 +392,12 @@ def _override_spatial_extent_from_artifact(collection: dict[str, Any], artifact: def _override_temporal_extent_from_artifact(collection: dict[str, Any], artifact: ArtifactRecord) -> None: temporal = artifact.coverage.temporal - if temporal.start is None and temporal.end is None: - collection["extent"]["temporal"]["interval"] = [[None, None]] - return - start = parse_period_string_to_datetime(temporal.start).isoformat().replace("+00:00", "Z") - end = parse_period_string_to_datetime(temporal.end).isoformat().replace("+00:00", "Z") + + def _fmt(period: str | None) -> str | None: + return parse_period_string_to_datetime(period).isoformat().replace("+00:00", "Z") if period else None + + start = _fmt(temporal.start) + end = _fmt(temporal.end) 
collection["extent"]["temporal"]["interval"] = [[start, end]] dimensions = collection.setdefault("cube:dimensions", {}) for key, value in dimensions.items(): diff --git a/tests/test_stac.py b/tests/test_stac.py index 0736bb55..7e532561 100644 --- a/tests/test_stac.py +++ b/tests/test_stac.py @@ -342,7 +342,7 @@ def test_collection_sets_hourly_step_to_pt1h(client: TestClient, monkeypatch: py assert payload["cube:dimensions"]["valid_time"]["step"] == "PT1H" -def test_collection_uses_level0_href_for_pyramid_zarr_store( +def test_collection_uses_root_href_for_pyramid_zarr_store( client: TestClient, monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" @@ -377,10 +377,10 @@ def test_collection_uses_level0_href_for_pyramid_zarr_store( assert response.status_code == 200 payload = response.json() - assert payload["assets"]["zarr"]["href"].endswith("/zarr/chirps3_precipitation_daily/0") + assert payload["assets"]["zarr"]["href"].endswith("/zarr/chirps3_precipitation_daily") -def test_collection_uses_level0_href_for_remote_pyramid_zarr_store( +def test_collection_uses_root_href_for_remote_pyramid_zarr_store( client: TestClient, monkeypatch: pytest.MonkeyPatch ) -> None: artifact = _artifact(artifact_id="a1", path="s3://example-bucket/chirps3_precipitation_daily.zarr") @@ -414,7 +414,7 @@ def test_collection_uses_level0_href_for_remote_pyramid_zarr_store( assert response.status_code == 200 payload = response.json() - assert payload["assets"]["zarr"]["href"].endswith("/zarr/chirps3_precipitation_daily/0") + assert payload["assets"]["zarr"]["href"].endswith("/zarr/chirps3_precipitation_daily") def test_collection_returns_404_for_unknown_dataset(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: From f70d3e742b709c46032c6baebdf4f7f8943b0a2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 22:22:58 +0200 Subject: [PATCH 78/80] fix: initialise da=None before retry loop 
to satisfy pyright Co-Authored-By: Claude Sonnet 4.6 --- climate_api/ingest/plugins/chirps3.py | 1 + 1 file changed, 1 insertion(+) diff --git a/climate_api/ingest/plugins/chirps3.py b/climate_api/ingest/plugins/chirps3.py index 34e8f0bd..c23224e9 100644 --- a/climate_api/ingest/plugins/chirps3.py +++ b/climate_api/ingest/plugins/chirps3.py @@ -92,6 +92,7 @@ def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Datase url = self._url_for_day(d) logger.info("Fetching CHIRPS3 %s: %s", period_id, url) + da = None for attempt in range(3): try: da = rioxarray.open_rasterio(url, chunks=None, masked=True, lock=False) From eb2736e985998c00e044b094281b0fd74d02aa60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 22:55:51 +0200 Subject: [PATCH 79/80] docs: remove outdated single-CRS constraint from zarr_and_geozarr.md Co-Authored-By: Claude Sonnet 4.6 --- docs/zarr_and_geozarr.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/zarr_and_geozarr.md b/docs/zarr_and_geozarr.md index df78bad3..252edbec 100644 --- a/docs/zarr_and_geozarr.md +++ b/docs/zarr_and_geozarr.md @@ -38,7 +38,6 @@ The two halves of the term map directly onto the choices described in this docum **Analysis-ready** means a consumer can open the data and start computing without preprocessing: - Dimension names are normalised to `(time, x, y)` regardless of the source convention. -- All datasets in an instance share a single coordinate reference system. - Units are standardised by the transform pipeline (e.g. Kelvin → Celsius). **Cloud-optimized** means the data can be accessed efficiently over HTTP without downloading the whole file. The Zarr and GeoZarr formats provide all the necessary properties — chunk-level access, HTTP-native serving, multiscale pyramids, and cloud compatibility. 
From 2c56fbb26224c80c1cb0837f355b120cb966f601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Sandvik?= Date: Wed, 20 May 2026 22:57:05 +0200 Subject: [PATCH 80/80] docs: move Store layout on disk section after Icechunk section Co-Authored-By: Claude Sonnet 4.6 --- docs/zarr_and_geozarr.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/zarr_and_geozarr.md b/docs/zarr_and_geozarr.md index 252edbec..89132dea 100644 --- a/docs/zarr_and_geozarr.md +++ b/docs/zarr_and_geozarr.md @@ -46,17 +46,6 @@ The Climate API targets the same access pattern at country scale for arbitrary s --- -## Store layout on disk - -Each managed dataset has exactly one Icechunk repository on disk, stored under `{data_dir}/downloads/{dataset_id}.icechunk`. The zarr content inside the repository is either: - -- **Flat** — a single-resolution store with dimensions `(time, x, y)` -- **Pyramid** — a multi-resolution store with levels `0/`, `1/`, `2/`, … where `0/` is the full resolution - -The flat vs. pyramid decision is made at ingest time based on spatial size (see [Multiscale pyramids](#multiscale-pyramids) below). - ---- - ## Icechunk — versioned Zarr storage [Icechunk](https://icechunk.io) is a transactional storage layer that sits between the application and the underlying Zarr v3 data. It exposes a standard Zarr store interface to writers and readers, but adds **MVCC (multi-version concurrency control)**: every write is committed as an immutable snapshot, and readers always see a consistent view of the data regardless of concurrent writes. @@ -92,6 +81,17 @@ Zarr keys are read directly from the Icechunk session store rather than from fil --- +## Store layout on disk + +Each managed dataset has exactly one Icechunk repository on disk, stored under `{data_dir}/downloads/{dataset_id}.icechunk`. 
The zarr content inside the repository is either: + +- **Flat** — a single-resolution store with dimensions `(time, x, y)` +- **Pyramid** — a multi-resolution store with levels `0/`, `1/`, `2/`, … where `0/` is the full resolution + +The flat vs. pyramid decision is made at ingest time based on spatial size (see [Multiscale pyramids](#multiscale-pyramids) below). + +--- + ## Chunk sizing Chunks are sized to match expected access patterns. The goal is that reading one time step for the full spatial extent fits in one round-trip, and that full time series for a small area also fits in one round-trip.