diff --git a/AGENTS.md b/AGENTS.md index e34e7519..b80e6adb 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -8,7 +8,7 @@ The DHIS2 Climate API is a FastAPI-based REST API that downloads, processes, and Key concepts: -- **Dataset templates** — YAML files in `data/datasets/` describing a data source (variable, period type, download function). These are blueprints. +- **Dataset templates** — YAML files in `data/datasets/` describing a data source (variable, period type, ingestion plugin). These are blueprints. - **Artifacts / managed datasets** — ingested instances of a template for a specific spatial extent and time range. Exposed under `/datasets` and `/zarr/{dataset_id}`. - **Extent** — a single named spatial bounding box configured at instance setup time (`id`, `bbox`, optional `country_code`). Exposed at `GET /extent`. - **GeoZarr stores** — datasets are stored as chunked Zarr v3 archives with GeoZarr spatial attributes. Flat stores for small extents; multiscale pyramids for large ones. Served chunk-by-chunk over HTTP with no specialised server middleware. @@ -44,18 +44,17 @@ The `.env` file is required for `make run` and `make openapi`. Copy `.env.exampl ## Dataset templates -Each YAML in `data/datasets/` defines a dataset template. The `ingestion` block controls download and zarr build behaviour: +Each YAML in `data/datasets/` defines a dataset template. The `ingestion` block specifies the plugin class that streams data directly into the Icechunk store: ```yaml ingestion: - function: dhis2eo.data.worldpop.pop_total.yearly.download - default_params: {} # passed to the download function + plugin: climate_api.ingest.plugins.worldpop.WorldPopPlugin + params: + version: global2 ``` `build_dataset_zarr` in `data_manager/downloader.py` builds a multiscale Zarr pyramid when the spatial dimensions exceed 2048×2048 pixels; otherwise it writes a flat chunked zarr with chunk sizes derived from the dataset's temporal resolution. 
-The ingestion interface is being redesigned as a plugin protocol (see GitHub issue #64) — the `ingestion.function` convention will be replaced by a three-method async plugin (`probe`, `periods`, `fetch_period`). - ## pygeoapi pygeoapi is mounted at `/ogcapi` as a sub-application. Its config is generated dynamically from published artifacts by `publications/services.py` and written to `data/pygeoapi/pygeoapi-config.yml`. diff --git a/climate_api/client.py b/climate_api/client.py index 7b820e41..08318104 100644 --- a/climate_api/client.py +++ b/climate_api/client.py @@ -1,19 +1,13 @@ """Lightweight client for discovering and opening published Climate API datasets.""" -import os from urllib.parse import urlparse import httpx import xarray as xr -_FALLBACK_BASE_URL = "http://127.0.0.1:8000" _DEFAULT_TIMEOUT = 30.0 -def _default_base_url() -> str: - return os.environ.get("CLIMATE_API_BASE_URL", _FALLBACK_BASE_URL) - - def _id_from_href(href: str) -> str: """Extract the dataset id from a STAC child href by reading the last URL path segment.""" return urlparse(href).path.rstrip("/").rsplit("/", 1)[-1] @@ -71,8 +65,8 @@ def open(self, dataset_id: str) -> xr.Dataset: """Open a published dataset as an xarray Dataset. Fetches the STAC collection for ``dataset_id``, reads the Zarr asset - metadata, and returns the opened dataset. Coordinates are always - ``time``, ``latitude``, and ``longitude``. + metadata, and returns the opened dataset. Spatial dimensions are always + named ``x`` and ``y``; the time dimension, when present, is ``time``. 
""" response = self._http.get(f"{self.base_url}/stac/collections/{dataset_id}") response.raise_for_status() @@ -92,55 +86,3 @@ def open(self, dataset_id: str) -> xr.Dataset: if not isinstance(open_kwargs, dict): raise ValueError(f"Zarr asset for '{dataset_id}' has a malformed xarray:open_kwargs field") return xr.open_zarr(href, **open_kwargs) # type: ignore[no-any-return] - - -def list_datasets(base_url: str | None = None) -> list[dict]: - """Return all published datasets from the STAC catalog. - - Each entry is a STAC child link dict with at least ``id``, ``title``, and ``href``. - ``base_url`` defaults to the ``CLIMATE_API_BASE_URL`` environment variable, - falling back to ``http://127.0.0.1:8000``. - """ - url = (base_url or _default_base_url()).rstrip("/") - response = httpx.get(f"{url}/stac/catalog.json", timeout=_DEFAULT_TIMEOUT) - response.raise_for_status() - catalog = response.json() - raw_links = catalog.get("links") - if not isinstance(raw_links, list): - raise ValueError(f"Invalid STAC catalog response from {url}: missing or non-list 'links' field") - links = [] - for link in raw_links: - if isinstance(link, dict) and link.get("rel") == "child": - href = link.get("href") - if not isinstance(href, str) or not href: - raise ValueError(f"STAC child link from {url} has a missing or invalid href") - links.append({**link, "id": _id_from_href(href)}) - return links - - -def open_dataset(dataset_id: str, *, base_url: str | None = None) -> xr.Dataset: - """Open a published dataset as an xarray Dataset. - - Fetches the STAC collection for ``dataset_id``, reads the Zarr asset - metadata, and returns the opened dataset. Coordinates are always - ``time``, ``latitude``, and ``longitude``. - ``base_url`` defaults to the ``CLIMATE_API_BASE_URL`` environment variable, - falling back to ``http://127.0.0.1:8000``. 
- """ - url = (base_url or _default_base_url()).rstrip("/") - response = httpx.get(f"{url}/stac/collections/{dataset_id}", timeout=_DEFAULT_TIMEOUT) - response.raise_for_status() - collection = response.json() - assets = collection.get("assets") - if not isinstance(assets, dict): - raise ValueError(f"STAC collection for '{dataset_id}' from {url} has a missing or invalid 'assets' field") - asset = assets.get("zarr") - if not isinstance(asset, dict): - raise ValueError(f"Dataset '{dataset_id}' has no Zarr asset in the STAC collection") - href = asset.get("href") - if not isinstance(href, str) or not href: - raise ValueError(f"Zarr asset for '{dataset_id}' has a missing or invalid href") - open_kwargs = asset.get("xarray:open_kwargs", {}) - if not isinstance(open_kwargs, dict): - raise ValueError(f"Zarr asset for '{dataset_id}' has a malformed xarray:open_kwargs field") - return xr.open_zarr(href, **open_kwargs) # type: ignore[no-any-return] diff --git a/climate_api/data/datasets/chirps3.yaml b/climate_api/data/datasets/chirps3.yaml index a7a13a8c..c1218e6b 100644 --- a/climate_api/data/datasets/chirps3.yaml +++ b/climate_api/data/datasets/chirps3.yaml @@ -6,8 +6,6 @@ sync: kind: temporal execution: append - availability: - latest_available_function: climate_api.providers.availability.chirps3_daily_latest_available extents: spatial: bbox: [-180, -50, 180, 50] @@ -15,7 +13,10 @@ begin: "1981-01-01" resolution: P1D ingestion: - function: dhis2eo.data.chc.chirps3.daily.download + plugin: climate_api.ingest.plugins.chirps3.Chirps3Plugin + params: + stage: final + flavor: rnl units: mm resolution: 5 km x 5 km source: CHIRPS v3 diff --git a/climate_api/data/datasets/era5_land.yaml b/climate_api/data/datasets/era5_land.yaml index 14708404..58ca9e00 100644 --- a/climate_api/data/datasets/era5_land.yaml +++ b/climate_api/data/datasets/era5_land.yaml @@ -6,9 +6,6 @@ sync: kind: temporal execution: append - availability: - latest_available_function: 
climate_api.providers.availability.lagged_latest_available - lag_hours: 120 extents: spatial: bbox: [-180, -90, 180, 90] @@ -16,9 +13,9 @@ begin: "1950-01-01" resolution: PT1H ingestion: - function: dhis2eo.data.destine.era5_land.hourly.download - default_params: - variables: ['t2m'] + plugin: climate_api.ingest.plugins.era5_land.Era5LandPlugin + params: + variable: t2m transforms: - climate_api.transforms.kelvin_to_celsius units: degC @@ -37,9 +34,6 @@ sync: kind: temporal execution: append - availability: - latest_available_function: climate_api.providers.availability.lagged_latest_available - lag_hours: 120 extents: spatial: bbox: [-180, -90, 180, 90] @@ -47,9 +41,9 @@ begin: "1950-01-01" resolution: PT1H ingestion: - function: dhis2eo.data.destine.era5_land.hourly.download - default_params: - variables: ['tp'] + plugin: climate_api.ingest.plugins.era5_land.Era5LandPlugin + params: + variable: tp transforms: - climate_api.transforms.metres_to_mm units: mm diff --git a/climate_api/data/datasets/worldpop.yaml b/climate_api/data/datasets/worldpop.yaml index b1d6aa9a..448fbdc6 100644 --- a/climate_api/data/datasets/worldpop.yaml +++ b/climate_api/data/datasets/worldpop.yaml @@ -5,10 +5,6 @@ period_type: yearly sync: kind: release - availability: - latest_available_function: climate_api.providers.availability.worldpop_release_latest_available - # WorldPop projections are intentionally request-driven for future years. 
- allow_future: true extents: spatial: bbox: [-180, -90, 180, 90] @@ -17,8 +13,9 @@ end: "2030" resolution: P1Y ingestion: - function: dhis2eo.data.worldpop.pop_total.yearly.download - default_params: + plugin: climate_api.ingest.plugins.worldpop.WorldPopPlugin + params: + # country_code is injected automatically from extent.country_code in climate-api.yaml version: global2 units: people resolution: 100m x 100m @@ -27,4 +24,3 @@ display: colormap: reds range: [0.0, 25.0] - nodata: 0.0 diff --git a/climate_api/data_accessor/services/accessor.py b/climate_api/data_accessor/services/accessor.py index b3a31b92..6fbdd0e4 100644 --- a/climate_api/data_accessor/services/accessor.py +++ b/climate_api/data_accessor/services/accessor.py @@ -3,13 +3,14 @@ import logging import os import tempfile +from pathlib import Path from typing import Any import numpy as np import xarray as xr from pyproj import Transformer -from ...data_manager.services.downloader import get_cache_files, get_zarr_path +from ...data_manager.services.downloader import get_icechunk_path from ...data_manager.services.utils import get_time_dim, get_x_y_dims from ...shared.time import numpy_datetime_to_period_string @@ -24,21 +25,8 @@ def get_data( ) -> xr.Dataset: """Load an xarray raster dataset for a given time range and bbox.""" logger.info("Opening dataset") - zarr_path = get_zarr_path(dataset) - if zarr_path: - logger.info(f"Using optimized zarr file: {zarr_path}") - ds = open_zarr_dataset(str(zarr_path)) - else: - logger.warning( - f"Could not find optimized zarr file for dataset {dataset['id']}, using slower netcdf files instead." 
- ) - files = get_cache_files(dataset) - ds = xr.open_mfdataset( - files, - data_vars="minimal", - coords="minimal", # pyright: ignore[reportArgumentType] - compat="override", - ) + store_path = get_icechunk_path(dataset) + ds = open_icechunk_dataset(store_path) if start and end: logger.info(f"Subsetting time to {start} and {end}") @@ -72,25 +60,10 @@ def get_data_coverage(dataset: dict[str, Any]) -> dict[str, Any]: def get_data_coverage_for_paths( dataset: dict[str, Any], *, - zarr_path: str | None = None, - netcdf_paths: list[str] | None = None, + zarr_path: str, ) -> dict[str, Any]: - """Return coverage metadata for the concrete files created for one artifact.""" - if zarr_path is not None and netcdf_paths: - raise ValueError("Provide either zarr_path or netcdf_paths when computing coverage, not both") - if zarr_path is None and not netcdf_paths: - raise ValueError("Coverage calculation requires either zarr_path or at least one netcdf path") - - if zarr_path is not None: - ds = open_zarr_dataset(zarr_path) - else: - assert netcdf_paths is not None - ds = xr.open_mfdataset( - netcdf_paths, - data_vars="minimal", - coords="minimal", # pyright: ignore[reportArgumentType] - compat="override", - ) + """Return coverage metadata for a materialized flat-zarr artifact.""" + ds = open_zarr_dataset(zarr_path) from climate_api import config as api_config @@ -124,11 +97,40 @@ def open_zarr_dataset(zarr_path: str) -> xr.Dataset: return ds +def open_icechunk_dataset(store_path: str | Path) -> xr.Dataset: + """Open an Icechunk store as an xarray Dataset via a readonly MVCC session. + + Detects multiscale pyramid stores (root group has ``multiscales`` in attrs) + and opens group ``0`` (full resolution) in that case. 
+ """ + import icechunk + import zarr + + path = Path(store_path) + if not path.exists(): + raise FileNotFoundError(f"Icechunk store not found: {path}") + storage = icechunk.local_filesystem_storage(str(path)) + repo = icechunk.Repository.open(storage) + session = repo.readonly_session("main") + root = zarr.open_group(session.store, mode="r") + group: str | None = "0" if "multiscales" in root.attrs else None + return xr.open_zarr(session.store, group=group) # type: ignore[no-any-return] + + def _open_zarr(zarr_path: str) -> xr.Dataset: """Open a zarr store with automatic consolidated metadata detection.""" return xr.open_zarr(zarr_path, consolidated=None) # type: ignore[no-any-return] +def coverage_from_open_dataset(ds: xr.Dataset, *, period_type: str, native_crs: str = "EPSG:4326") -> dict[str, Any]: + """Summarize temporal and spatial coverage for a caller-managed open dataset. + + Unlike get_data_coverage_for_paths, this function does not close the dataset. + Use when the caller already holds a store handle (e.g. an Icechunk session store). 
+ """ + return _coverage_from_dataset(ds=ds, period_type=period_type, native_crs=native_crs) + + def _coverage_from_dataset(*, ds: xr.Dataset, period_type: str, native_crs: str = "EPSG:4326") -> dict[str, Any]: """Summarize temporal and spatial coverage for an already opened dataset.""" if any(size == 0 for size in ds.sizes.values()): @@ -141,11 +143,16 @@ def _coverage_from_dataset(*, ds: xr.Dataset, period_type: str, native_crs: str }, } - time_dim = get_time_dim(ds) x_dim, y_dim = get_x_y_dims(ds) - start = _period_string_scalar(numpy_datetime_to_period_string(ds[time_dim].min(), period_type)) # type: ignore[arg-type] - end = _period_string_scalar(numpy_datetime_to_period_string(ds[time_dim].max(), period_type)) # type: ignore[arg-type] + start: str | None + end: str | None + try: + time_dim = get_time_dim(ds) + start = _period_string_scalar(numpy_datetime_to_period_string(ds[time_dim].min(), period_type)) # type: ignore[arg-type] + end = _period_string_scalar(numpy_datetime_to_period_string(ds[time_dim].max(), period_type)) # type: ignore[arg-type] + except ValueError: + start = end = None xmin, xmax = ds[x_dim].min().item(), ds[x_dim].max().item() ymin, ymax = ds[y_dim].min().item(), ds[y_dim].max().item() diff --git a/climate_api/data_manager/__init__.py b/climate_api/data_manager/__init__.py index 186a758f..1331b20f 100644 --- a/climate_api/data_manager/__init__.py +++ b/climate_api/data_manager/__init__.py @@ -1,4 +1,3 @@ """Data manager package.""" -from . import routes as routes from . 
import services as services diff --git a/climate_api/data_manager/routes.py b/climate_api/data_manager/routes.py deleted file mode 100644 index ebd6703a..00000000 --- a/climate_api/data_manager/routes.py +++ /dev/null @@ -1,51 +0,0 @@ -"""FastAPI router exposing dataset endpoints.""" - -from fastapi import APIRouter, BackgroundTasks - -from ..data_registry.routes import _get_dataset_or_404 -from .services import downloader - -router = APIRouter() - - -@router.get( - "/{dataset_id}/download", - response_model=dict, - summary="Internal dataset download", - include_in_schema=False, -) -def download_dataset( - dataset_id: str, - start: str, - background_tasks: BackgroundTasks, - end: str | None = None, - overwrite: bool = False, -) -> dict[str, str]: - """Internal low-level cache download route kept for compatibility.""" - dataset = _get_dataset_or_404(dataset_id) - downloader.download_dataset( - dataset, - start=start, - end=end, - bbox=None, - country_code=None, - overwrite=overwrite, - background_tasks=background_tasks, - ) - return {"status": "Downloading data for dataset"} - - -@router.get( - "/{dataset_id}/build_zarr", - response_model=dict, - summary="Internal dataset Zarr build", - include_in_schema=False, -) -def build_dataset_zarr( - dataset_id: str, - background_tasks: BackgroundTasks, -) -> dict[str, str]: - """Internal low-level cache optimization route kept for compatibility.""" - dataset = _get_dataset_or_404(dataset_id) - background_tasks.add_task(downloader.build_dataset_zarr, dataset) - return {"status": "Building zarr file from dataset downloads"} diff --git a/climate_api/data_manager/services/downloader.py b/climate_api/data_manager/services/downloader.py index 276cd7b0..2dacfbea 100644 --- a/climate_api/data_manager/services/downloader.py +++ b/climate_api/data_manager/services/downloader.py @@ -1,26 +1,15 @@ -"""Dataset cache: download, store, and optimize raster data as local files.""" +"""Dataset cache: utilities for locating and reading 
downloaded raster files.""" -import datetime import importlib -import inspect import logging import os -import shutil from collections.abc import Callable from pathlib import Path from typing import Any import xarray as xr -import xproj # noqa: F401 # type: ignore[import-untyped] # pyright: ignore[reportUnusedImport] -from fastapi import BackgroundTasks, HTTPException -from geozarr_toolkit import MultiscalesConventionMetadata, create_geozarr_attrs -from topozarr.coarsen import create_pyramid from climate_api import config as api_config -from climate_api.shared.time import resolve_iso_period_step, time_chunk_for_iso_step -from climate_api.transforms.reproject import reproject_to_instance_crs - -from .utils import get_time_dim, get_x_y_dims logger = logging.getLogger(__name__) @@ -36,233 +25,6 @@ def _resolve_download_dir() -> Path: DOWNLOAD_DIR = _resolve_download_dir() -def download_dataset( - dataset: dict[str, Any], - start: str, - end: str | None, - bbox: list[float] | None, - country_code: str | None, - overwrite: bool, - background_tasks: BackgroundTasks | None, -) -> list[Path]: - """Download dataset files and return the NetCDF paths created or modified by this run. - - The download still happens primarily through side effects in the provider function. - This return value is used to identify the concrete files created for this invocation. - When running in the background-task path, the download is deferred and this function - returns an empty list because no files have been created yet. 
- """ - _validate_spatial_coverage(dataset, bbox if bbox is not None else _bbox_from_env()) - ingestion = dataset["ingestion"] - eo_download_func_path = ingestion["function"] - eo_download_func = _get_dynamic_function(eo_download_func_path) - before_files = {path.resolve(): path.stat().st_mtime_ns for path in get_cache_files(dataset)} - - params = dict(ingestion.get("default_params", {})) - params.update( - { - "start": start, - "end": end or datetime.date.today().isoformat(), - "dirname": DOWNLOAD_DIR, - "prefix": _get_cache_prefix(dataset), - "overwrite": overwrite, - } - ) - - sig = inspect.signature(eo_download_func) - try: - if "bbox" in sig.parameters: - params["bbox"] = _resolve_bbox(bbox=bbox) - if "country_code" in sig.parameters: - resolved_country_code = country_code or os.getenv("COUNTRY_CODE") - if resolved_country_code: - params["country_code"] = resolved_country_code - else: - raise HTTPException( - status_code=400, - detail=( - "Downloading this dataset requires a country code. " - "Provide it through the resolved extent configuration or set COUNTRY_CODE in the environment." 
- ), - ) - except HTTPException: - raise - except ValueError as exc: - raise HTTPException(status_code=400, detail=str(exc)) from exc - - if background_tasks is not None: - background_tasks.add_task(eo_download_func, **params) - return [] - - try: - eo_download_func(**params) - except HTTPException: - raise - except ValueError as exc: - raise HTTPException(status_code=400, detail=str(exc)) from exc - except Exception as exc: - message = str(exc).strip() or "Unexpected error from upstream data provider" - raise HTTPException(status_code=502, detail=f"Upstream dataset download failed: {message}") from exc - - after_files = [path.resolve() for path in get_cache_files(dataset)] - changed_files = [ - path for path in after_files if path not in before_files or path.stat().st_mtime_ns != before_files[path] - ] - return changed_files - - -def build_dataset_zarr(dataset: dict[str, Any], *, start: str | None = None, end: str | None = None) -> None: - """Collect dataset cache files into one optimised Zarr archive, clipped to request scope.""" - logger.info(f"Optimizing cache for dataset {dataset['id']}") - - files = get_cache_files(dataset) - logger.info(f"Opening {len(files)} files from cache") - ds = xr.open_mfdataset(files, parallel=False) - - x_dim, y_dim = get_x_y_dims(ds) - dims = [x_dim, y_dim] - - # trim to only minimal vars and coords before loading into memory - logger.info("Trimming unnecessary variables and coordinates") - varname = dataset["variable"] - ds = ds[[varname]] - keep_coords = [get_time_dim(ds)] + dims - drop_coords = [c for c in ds.coords if c not in keep_coords] - ds = ds.drop_vars(drop_coords) - - # Normalise to canonical names so all stored Zarr files are consistent. 
- crs = api_config.get_crs() - time_dim = get_time_dim(ds) - rename_map = {k: v for k, v in [(time_dim, "time"), (x_dim, "x"), (y_dim, "y")] if k != v} - if rename_map: - ds = ds.rename(rename_map) - x_dim, y_dim = "x", "y" - dims = [x_dim, y_dim] - - ds = _select_time_range(ds, dataset=dataset, start=start, end=end) - ds = _run_transforms(ds, dataset) - - source_crs: str = dataset.get("source_crs", "EPSG:4326") - ds = reproject_to_instance_crs(ds, dataset, source_crs=source_crs) - - xmin = ds[x_dim].min().item() - xmax = ds[x_dim].max().item() - ymin = ds[y_dim].min().item() - ymax = ds[y_dim].max().item() - bbox = [xmin, ymin, xmax, ymax] - shape = (ds.sizes[x_dim], ds.sizes[y_dim]) - - # https://github.com/zarr-developers/geozarr-toolkit/issues/15 - geozarr_attrs = create_geozarr_attrs( - dimensions=dims, - crs=crs, - bbox=bbox, - shape=shape, - ) - - # save as zarr - logger.info("Saving to optimized zarr file") - zarr_path = DOWNLOAD_DIR / f"{_get_cache_prefix(dataset)}.zarr" - - if _needs_pyramid(ds, x_dim, y_dim): - levels = _pyramid_levels(ds, x_dim, y_dim) - logger.info("Building %d-level pyramid (max dim %d pixels)", levels, max(ds.sizes[x_dim], ds.sizes[y_dim])) - - # Add multiscales convention metadata to the zarr attributes - zarr_conventions = geozarr_attrs.get("zarr_conventions", []) - zarr_conventions.append(MultiscalesConventionMetadata().model_dump()) - geozarr_attrs["zarr_conventions"] = zarr_conventions - - # Load into memory then close to deterministically release netCDF file handles - # before create_pyramid spawns multiprocessing workers. After load() the data - # lives in numpy arrays and no longer needs the underlying file objects. 
- ds.load() - ds.close() - - ds = ds.proj.assign_crs(spatial_ref=crs) - - # https://github.com/carbonplan/topozarr/issues/13 - pyramid = create_pyramid(ds, levels=levels, x_dim=x_dim, y_dim=y_dim, method="mean") - - pyramid.dt.attrs.update(geozarr_attrs) - pyramid.dt.to_zarr(zarr_path, mode="w", encoding=pyramid.encoding, zarr_format=3) - - # zarr-layer looks for the time coordinate at the root of the store, not inside each level. - # Copy it from level 0 so browser clients can discover it without knowing the level structure. - time_dim = get_time_dim(ds) - time_src = zarr_path / "0" / time_dim - time_dst = zarr_path / time_dim - if time_src.exists(): - if time_dst.exists(): - shutil.rmtree(time_dst) - shutil.copytree(time_src, time_dst) - - pyramid.dt.close() - - else: - logger.info("Building flat zarr (max dim %d pixels)", max(ds.sizes[x_dim], ds.sizes[y_dim])) - uniform_chunks = _compute_time_space_chunks(ds, dataset) - logger.info(f"--> {uniform_chunks}") - - ds.attrs.update(geozarr_attrs) - ds_chunked = ds.chunk(uniform_chunks) - # Remove _FillValue from each variable's encoding so that in-memory NaN values - # are stored as IEEE NaN in zarr rather than re-encoded as a sentinel (e.g. - # -999.99). ZarrLayer uses the zarr fill_value attribute (nan for floats) to - # render missing pixels as transparent — not a separately specified fillValue. 
- for var in ds_chunked.data_vars: - ds_chunked[var].encoding.pop("_FillValue", None) - ds_chunked.to_zarr(zarr_path, mode="w", zarr_format=3, consolidated=True) - ds_chunked.close() - - ds.close() - logger.info("Finished cache optimization") - - -_PYRAMID_PIXEL_THRESHOLD = 2048 * 2048 -_PYRAMID_MAX_LEVELS = 8 -_PYRAMID_TARGET_TILE_SIZE = 512 - - -def _needs_pyramid(ds: xr.Dataset, x_dim: str, y_dim: str) -> bool: - """Return True when the spatial extent is large enough to benefit from a pyramid.""" - return ds.sizes[x_dim] * ds.sizes[y_dim] > _PYRAMID_PIXEL_THRESHOLD - - -def _pyramid_levels(ds: xr.Dataset, x_dim: str, y_dim: str) -> int: - """Compute the number of pyramid levels needed to reach a manageable tile size.""" - import math - - max_dim = max(ds.sizes[x_dim], ds.sizes[y_dim]) - levels = math.ceil(math.log2(max_dim / _PYRAMID_TARGET_TILE_SIZE)) - return max(2, min(levels, _PYRAMID_MAX_LEVELS)) - - -def _select_time_range( - ds: xr.Dataset, - *, - dataset: dict[str, Any], - start: str | None, - end: str | None, -) -> xr.Dataset: - """Clip a cached dataset to the managed artifact's requested temporal scope.""" - if start is None and end is None: - return ds - - time_dim = get_time_dim(ds) - selected = ds.sel({time_dim: slice(start, end)}) - if selected.sizes.get(time_dim, 0) == 0: - raise ValueError(f"No cached data for dataset '{dataset['id']}' intersects requested time range {start}..{end}") - logger.info( - "Clipped dataset '%s' to requested time range %s..%s (%d steps)", - dataset["id"], - start, - end, - selected.sizes[time_dim], - ) - return selected - - def _run_transforms(ds: xr.Dataset, dataset: dict[str, Any]) -> xr.Dataset: dataset_id = dataset.get("id", "?") for entry in dataset.get("transforms", []): @@ -286,92 +48,13 @@ def _run_transforms(ds: xr.Dataset, dataset: dict[str, Any]) -> xr.Dataset: return ds -def _compute_time_space_chunks( - ds: xr.Dataset, - dataset: dict[str, Any], - max_spatial_chunk: int = 512, -) -> dict[str, int]: - 
"""Compute chunk sizes tuned for common temporal access patterns.""" - chunks: dict[str, int] = {} - - iso_step = resolve_iso_period_step(dataset) - dim = get_time_dim(ds) - if iso_step is not None: - try: - chunks[dim] = time_chunk_for_iso_step(iso_step) - except ValueError: - logger.warning( - "Invalid ISO 8601 step %r for dataset '%s'; defaulting time chunk to 12.", - iso_step, - dataset.get("id", "?"), - ) - chunks[dim] = 12 - else: - logger.warning( - "No ISO 8601 step for dataset '%s'; defaulting time chunk to 12. " - "Declare 'extents.temporal.resolution' in the template to silence this warning.", - dataset.get("id", "?"), - ) - chunks[dim] = 12 - - x_dim, y_dim = get_x_y_dims(ds) - chunks[x_dim] = min(ds.sizes[x_dim], max_spatial_chunk) - chunks[y_dim] = min(ds.sizes[y_dim], max_spatial_chunk) - - return chunks - - def _get_cache_prefix(dataset: dict[str, Any]) -> str: return str(dataset["id"]) -def get_cache_files(dataset: dict[str, Any]) -> list[Path]: - """Return all NetCDF cache files matching this dataset's prefix.""" - # TODO: not bulletproof -- e.g. 
2m_temperature matches 2m_temperature_modified - prefix = _get_cache_prefix(dataset) - return list(DOWNLOAD_DIR.glob(f"{prefix}*.nc")) - - -def get_zarr_path(dataset: dict[str, Any]) -> Path | None: - """Return the optimised zarr archive path if it exists.""" - prefix = _get_cache_prefix(dataset) - optimized = DOWNLOAD_DIR / f"{prefix}.zarr" - if optimized.exists(): - return optimized - return None - - -def _validate_spatial_coverage(dataset: dict[str, Any], bbox: list[float] | None) -> None: - """Raise HTTP 400 if the request bbox falls outside the dataset's declared extents.""" - extents = dataset.get("extents") - if not extents or bbox is None: - return - spatial = extents.get("spatial") - if not spatial: - return - cov_bbox = spatial.get("bbox") - if not isinstance(cov_bbox, (list, tuple)) or len(cov_bbox) != 4: - return - cov_xmin, cov_ymin, cov_xmax, cov_ymax = cov_bbox - xmin, ymin, xmax, ymax = bbox - if ymin > cov_ymax or ymax < cov_ymin: - raise HTTPException( - status_code=400, - detail=( - f"Dataset '{dataset['id']}' does not cover this extent. " - f"Latitude coverage: {cov_ymin}°–{cov_ymax}°, " - f"requested: {ymin}°–{ymax}°." - ), - ) - if xmin > cov_xmax or xmax < cov_xmin: - raise HTTPException( - status_code=400, - detail=( - f"Dataset '{dataset['id']}' does not cover this extent. " - f"Longitude coverage: {cov_xmin}°–{cov_xmax}°, " - f"requested: {xmin}°–{xmax}°." 
- ), - ) +def get_icechunk_path(dataset: dict[str, Any]) -> Path: + """Return the Icechunk store path for a dataset (may not exist yet).""" + return DOWNLOAD_DIR / f"{_get_cache_prefix(dataset)}.icechunk" def _get_dynamic_function(full_path: str) -> Callable[..., Any]: @@ -381,29 +64,3 @@ def _get_dynamic_function(full_path: str) -> Callable[..., Any]: function_name = parts[-1] module = importlib.import_module(module_path) return getattr(module, function_name) # type: ignore[no-any-return] - - -def _resolve_bbox(*, bbox: list[float] | None) -> list[float]: - """Resolve bbox from request or environment.""" - if bbox is not None: - return bbox - - env_bbox = _bbox_from_env() - if env_bbox is not None: - return env_bbox - - raise ValueError( - "A bbox is required for this dataset. Provide it in the request or set DOWNLOAD_BBOX in the environment." - ) - - -def _bbox_from_env() -> list[float] | None: - """Parse a default bbox from environment if configured.""" - raw_bbox = os.getenv("DOWNLOAD_BBOX") or os.getenv("DEFAULT_DOWNLOAD_BBOX") - if not raw_bbox: - return None - - parts = [part.strip() for part in raw_bbox.split(",")] - if len(parts) != 4: - raise ValueError("DOWNLOAD_BBOX must contain four comma-separated numbers: xmin,ymin,xmax,ymax") - return [float(part) for part in parts] diff --git a/climate_api/data_manager/services/utils.py b/climate_api/data_manager/services/utils.py index f4e74dee..721febe7 100644 --- a/climate_api/data_manager/services/utils.py +++ b/climate_api/data_manager/services/utils.py @@ -5,8 +5,9 @@ def get_time_dim(ds: Any) -> str: """Return the name of the time dimension in a dataset or dataframe.""" + actual_dims: set[str] = set(getattr(ds, "dims", {}) or {}) for time_name in ["valid_time", "time"]: - if hasattr(ds, time_name): + if time_name in actual_dims: return time_name raise ValueError(f"Unable to find time dimension: {getattr(ds, 'coords', repr(ds))}") diff --git a/climate_api/data_registry/services/datasets.py 
b/climate_api/data_registry/services/datasets.py index a0487f48..1a5aea61 100644 --- a/climate_api/data_registry/services/datasets.py +++ b/climate_api/data_registry/services/datasets.py @@ -152,24 +152,6 @@ def _validate_dataset_template(dataset: object, *, source: str) -> None: ingestion = dataset.get("ingestion") if not isinstance(ingestion, dict): raise ValueError(f"Dataset template '{dataset_id}' in {source} must define an 'ingestion' block") - function = ingestion.get("function") - if not isinstance(function, str) or not function: - raise ValueError(f"Dataset template '{dataset_id}' in {source} must define ingestion.function") - - sync_availability = sync_block.get("availability") if isinstance(sync_block, dict) else None - if sync_availability is not None: - _validate_sync_availability(sync_availability, dataset_id=dataset_id, source=source) - - -def _validate_sync_availability(sync_availability: object, *, dataset_id: str, source: str) -> None: - """Validate optional source availability policy metadata.""" - if not isinstance(sync_availability, dict): - raise ValueError(f"Dataset template '{dataset_id}' in {source} has invalid sync.availability") - - latest_available_function = sync_availability.get("latest_available_function") - if latest_available_function is not None and ( - not isinstance(latest_available_function, str) or not latest_available_function - ): - raise ValueError( - f"Dataset template '{dataset_id}' in {source} has invalid sync.availability.latest_available_function" - ) + plugin = ingestion.get("plugin") + if not (isinstance(plugin, str) and plugin): + raise ValueError(f"Dataset template '{dataset_id}' in {source} must define ingestion.plugin") diff --git a/climate_api/ingest/__init__.py b/climate_api/ingest/__init__.py new file mode 100644 index 00000000..f79c90ce --- /dev/null +++ b/climate_api/ingest/__init__.py @@ -0,0 +1,6 @@ +"""Per-period Icechunk ingest — protocol, orchestrator, and built-in plugins.""" + +from 
climate_api.ingest.orchestrator import run_ingest, run_ingest_sync +from climate_api.ingest.protocol import GridSpec, IngestionPlugin + +__all__ = ["GridSpec", "IngestionPlugin", "run_ingest", "run_ingest_sync"] diff --git a/climate_api/ingest/orchestrator.py b/climate_api/ingest/orchestrator.py new file mode 100644 index 00000000..4d2315f2 --- /dev/null +++ b/climate_api/ingest/orchestrator.py @@ -0,0 +1,279 @@ +"""Per-period Icechunk ingest orchestrator. + +The orchestrator is the only place that writes to the Icechunk store. +Plugins implement three focused sync methods (probe / periods / fetch_period) +and never touch zarr directly. The orchestrator runs probe and fetch_period +via asyncio.to_thread so I/O-bound plugins run in the thread pool without +managing their own executor; periods() is called directly (pure computation). + +Crash recovery: every period is committed individually. The cursor is saved +every commit_batch_size periods so that a restart resumes from the last +cursor checkpoint. A crash loses at most commit_batch_size periods of +re-fetch work (the store itself is always in a valid committed state). +""" + +from __future__ import annotations + +import asyncio +import importlib +import inspect +import logging +from collections.abc import Callable +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import xarray as xr + +from climate_api.ingest.protocol import GridSpec, IngestionPlugin +from climate_api.ingest.store import build_pyramid_store, open_or_create_repo, read_committed_period_ids, rechunk_store + +logger = logging.getLogger(__name__) + +_CF_ENCODING_KEYS = frozenset({"scale_factor", "add_offset", "missing_value", "_FillValue", "coordinates"}) + + +def _strip_cf_encoding(ds: xr.Dataset, period_type: str) -> None: + """Strip CF attrs and clear encoding to prevent zarr append conflicts. + + GeoTIFF-sourced arrays carry scale_factor/add_offset/_FillValue in both + .encoding and .attrs. 
xarray raises ValueError when appending to zarr if + those keys collide with the stored array metadata from a prior write. + """ + for name in list(ds.data_vars) + list(ds.coords): + ds[name].encoding.clear() + ds[name].attrs = {k: v for k, v in ds[name].attrs.items() if k not in _CF_ENCODING_KEYS} + if "time" in ds.coords: + units = "hours since 1970-01-01" if period_type == "hourly" else "days since 1970-01-01" + ds["time"].encoding.update({"units": units, "dtype": "int32"}) + + +def _write_geozarr_attrs(store: Any, *, spec: GridSpec, bbox: list[float]) -> None: + """Write GeoZarr root-level attributes to the store after the first mode='w' write.""" + import zarr + + root = zarr.open_group(store, mode="r+") + attrs: dict[str, Any] = { + "proj:code": f"EPSG:{spec.crs}", + "spatial:bbox": bbox, + } + attrs.update(spec.attrs) + root.attrs.update(attrs) + + +def load_plugin( + dotted_path: str, + params: dict[str, Any], + extra_params: dict[str, Any] | None = None, +) -> IngestionPlugin: + """Instantiate an IngestionPlugin from a dotted import path and YAML params. + + The class is imported from dotted_path and called with **params. Built-in + plugins accept variable and other source-specific kwargs; custom plugins + define their own __init__ signature. + + extra_params are merged into params only for keys that the constructor + declares and that are not already present in params. This is used to inject + instance-level config (e.g. country_code from the extent) without requiring + every plugin to accept it. 
async def run_ingest(
    *,
    plugin: IngestionPlugin,
    params: dict[str, Any],
    bbox: list[float],
    start: str,
    end: str,
    store_path: Path,
    period_type: str,
    on_progress: Callable[..., None] | None = None,
    is_cancel_requested: Callable[[], bool] | None = None,
    save_cursor: Callable[[dict[str, Any]], None] | None = None,
    rechunk_time: int | None = None,
    apply_transforms: Callable[[xr.Dataset], xr.Dataset] | None = None,
    pyramid: bool = False,
) -> None:
    """Probe the source then stream per-period data into an Icechunk store.

    On the first run creates the store. On resume, reads committed period IDs
    directly from the Icechunk store and ingests only the missing periods.

    Fetches are scheduled through a sliding window of at most
    ``plugin.max_concurrency`` tasks, so no more than that many datasets are
    fetched-but-unwritten at any time. Writes are always sequential: periods
    are awaited in the order returned by ``plugin.periods`` so the time axis
    stays sorted.

    Args:
        plugin: Source plugin providing probe / periods / fetch_period.
        params: Extra keyword arguments forwarded to ``probe`` and ``fetch_period``.
        bbox: Bounding box forwarded to the plugin and written as ``spatial:bbox``.
        start: Start of the requested range, passed to ``plugin.periods``.
        end: End of the requested range, passed to ``plugin.periods``.
        store_path: Location of the Icechunk repository.
        period_type: Period granularity; selects the time encoding units and is
            used when reading the committed period IDs.
        on_progress: Optional callback invoked with ``done``, ``total`` and
            ``message`` keyword arguments.
        is_cancel_requested: Optional predicate polled before each period.
        save_cursor: Optional callback receiving ``{"last_committed": period_id}``
            every ``plugin.commit_batch_size`` periods and after the last one.
        rechunk_time: When set (and the dataset has a time dimension), the store
            is rechunked along time after ingest.
        apply_transforms: Optional transform applied to each fetched dataset
            before it is written.
        pyramid: When true, a multiscale pyramid is built after ingest.

    Raises:
        JobCancelledError: if ``is_cancel_requested`` returns true between periods.
    """
    spec: GridSpec = await asyncio.to_thread(plugin.probe, bbox, **params)
    logger.info("Probe: shape=%s crs=EPSG:%d time_dim=%s", spec.shape, spec.crs, spec.time_dim)

    # periods() is run in a worker thread as well: some plugins consult the
    # upstream source for availability there, which would otherwise block the
    # event loop for the duration of the network round-trips.
    all_periods = await asyncio.to_thread(plugin.periods, start, end)
    if not all_periods:
        logger.info("No periods available for range %s..%s", start, end)
        return

    # Always use the store as ground truth — the job cursor is a checkpoint that
    # lags behind actual Icechunk commits by up to commit_batch_size periods, so
    # trusting it directly would re-fetch already-committed periods after a crash.
    present = read_committed_period_ids(store_path, period_type)
    pending = [p for p in all_periods if p not in present]
    already_done = len(all_periods) - len(pending)
    logger.info("Periods: %d already committed, %d pending", already_done, len(pending))

    if not pending:
        logger.info("Store is current — nothing to ingest")
        return

    if on_progress:
        on_progress(done=already_done, total=len(all_periods), message=f"{len(pending)} periods pending")

    # True when no periods have been committed yet — handles both a brand-new
    # store and a store directory that exists as an empty skeleton from a
    # previous failed initialisation (where append_dim would fail on an empty store).
    is_first_write = already_done == 0
    # Timestamp taken before any commit of this run; used as the
    # expire_snapshots threshold below.
    ingest_started_at = datetime.now(tz=timezone.utc)
    repo = open_or_create_repo(store_path)

    # Sliding fetch window. Creating every task upfront would let completed
    # fetches pile up in memory whenever fetching outpaces the sequential
    # writer; instead at most `window` periods are fetched-but-unwritten.
    # A static (time-invariant) dataset only ever writes one period, so only
    # one fetch is started for it.
    window = max(1, plugin.max_concurrency) if spec.time_dim else 1
    inflight: dict[int, asyncio.Task[xr.Dataset]] = {}
    next_index = 0

    def _top_up() -> None:
        """Schedule fetches, in order, until the window is full or pending is exhausted."""
        nonlocal next_index
        while next_index < len(pending) and len(inflight) < window:
            inflight[next_index] = asyncio.create_task(
                asyncio.to_thread(plugin.fetch_period, pending[next_index], bbox, **params)
            )
            next_index += 1

    try:
        for i, period_id in enumerate(pending):
            if is_cancel_requested and is_cancel_requested():
                from climate_api.jobs.models import JobCancelledError

                # In-flight fetches are cancelled by the handler below.
                raise JobCancelledError("Ingest cancelled between periods")

            _top_up()
            ds = await inflight.pop(i)
            if apply_transforms is not None:
                ds = apply_transforms(ds)
            _strip_cf_encoding(ds, period_type=period_type)

            # Each period uses its own writable session so that to_zarr(append_dim=)
            # on the next period reads the committed store and finds the time axis.
            # Icechunk 2.x sessions do not expose uncommitted writes to subsequent
            # zarr.open_group calls, so batching writes within one session breaks the
            # append — committing per period is the correct pattern.
            session = repo.writable_session("main")

            is_first_period_write = not spec.time_dim or (i == 0 and is_first_write)
            if is_first_period_write:
                ds.to_zarr(session.store, mode="w")
                _write_geozarr_attrs(session.store, spec=spec, bbox=bbox)
            else:
                ds.to_zarr(session.store, append_dim="time")

            session.commit(f"ingest: {period_id}")
            # Release the written dataset before the next fetch is scheduled so
            # it does not count against the memory window.
            del ds

            # Save cursor at commit_batch_size intervals and at the end.
            # commit_batch_size controls resume granularity (cursor save frequency),
            # not commit frequency — every period is committed for correctness.
            if save_cursor and ((i + 1) % plugin.commit_batch_size == 0 or (i + 1) == len(pending)):
                save_cursor({"last_committed": period_id})
                logger.info("Cursor saved: up to %s (%d/%d)", period_id, i + 1, len(pending))

            logger.debug("Committed: %s (%d/%d)", period_id, i + 1, len(pending))

            if on_progress:
                on_progress(done=already_done + i + 1, total=len(all_periods), message=f"Wrote {period_id}")

            if not spec.time_dim:
                # Static dataset: a single write is the whole ingest.
                break
    except BaseException:
        # Covers errors, job cancellation and task cancellation alike.
        for task in inflight.values():
            task.cancel()
        raise

    if rechunk_time is not None and spec.time_dim:
        logger.info("Rechunking %s after ingest: time chunk → %d", store_path, rechunk_time)
        rechunk_store(store_path, time_chunk=rechunk_time)
        repo = open_or_create_repo(store_path)

    if pyramid:
        build_pyramid_store(store_path, x_dim=spec.x_dim, y_dim=spec.y_dim)
        # Reopen repo so expire_snapshots sees the post-pyramid HEAD.
        repo = open_or_create_repo(store_path)

    # Prune intermediate ingest snapshots: each period commit created one
    # snapshot; only the final state (HEAD of "main") needs to be retained.
    # expire_snapshots marks older snapshots as expired without deleting chunk
    # data — garbage_collect would be needed to reclaim manifest storage.
    # NOTE(review): the threshold is the time this run *started*, and
    # older_than presumably selects snapshots created before it. If so, the
    # per-period snapshots written during this run are not the ones expired
    # here — confirm against the Icechunk API that this matches the intent.
    try:
        expired = repo.expire_snapshots(older_than=ingest_started_at)
        if expired:
            logger.info("Expired %d intermediate snapshots from %s", len(expired), store_path)
    except Exception:
        # Best effort: pruning history must never fail an otherwise complete ingest.
        logger.warning("expire_snapshots failed for %s — store remains valid", store_path, exc_info=True)
class Chirps3Plugin:
    """IngestionPlugin for CHIRPS v3 daily precipitation.

    Args:
        stage: Data maturity stage — 'final' (default, stable) or 'prelim'
            (near-real-time, less reliable). Final data lags ~1–2 months.
        flavor: File variant within the stage — 'rnl' or 'sat' for final,
            'sat' for prelim. Defaults to 'rnl' (final/rnl recommended).

    Raises:
        ValueError: if ``stage`` / ``flavor`` is not one of the supported
            combinations.
    """

    max_concurrency = 1
    commit_batch_size = 30
    rechunk_time = 30

    def __init__(self, stage: str = "final", flavor: str = "rnl") -> None:
        if stage not in {"final", "prelim"}:
            raise ValueError(f"stage must be 'final' or 'prelim', got {stage!r}")
        if stage == "final" and flavor not in {"rnl", "sat"}:
            raise ValueError(f"For stage='final', flavor must be 'rnl' or 'sat', got {flavor!r}")
        if stage == "prelim" and flavor != "sat":
            raise ValueError(f"For stage='prelim', flavor must be 'sat', got {flavor!r}")
        self.stage = stage
        self.flavor = flavor

    # ------------------------------------------------------------------
    # Protocol implementation
    # ------------------------------------------------------------------

    def probe(self, bbox: list[float], **_: Any) -> GridSpec:
        """Derive GridSpec from CHIRPS3's known 0.05° resolution — no data transfer."""
        import math

        xmin, ymin, xmax, ymax = map(float, bbox)
        nx = max(1, math.ceil((xmax - xmin) / _CHIRPS3_RES_DEG))
        ny = max(1, math.ceil((ymax - ymin) / _CHIRPS3_RES_DEG))
        return GridSpec(
            shape=(ny, nx),
            crs=4326,
            dtype=np.dtype("float32"),
            nodata=_CHIRPS3_NODATA,
            time_dim=True,
        )

    def periods(self, start: str, end: str) -> list[str]:
        """Return daily period IDs in [start, end], clamped to the availability cutoff.

        Note: the cutoff is determined with HEAD requests against the CDN, so
        this method performs network I/O.
        """
        return enumerate_periods(start, end, "daily", cutoff=self._availability_cutoff())

    def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset:
        """Fetch one day via COG range request, clip to bbox, return as Dataset.

        Opening the remote file is retried (up to three attempts, with 10 s and
        20 s back-off) when the error text mentions HTTP 429 or 503; any other
        error, or the last failed attempt, is re-raised immediately.

        Raises:
            TypeError: if rioxarray does not return a DataArray.
        """
        import rioxarray

        d = date.fromisoformat(period_id)
        url = self._url_for_day(d)
        logger.info("Fetching CHIRPS3 %s: %s", period_id, url)

        max_attempts = 3
        da = None
        for attempt in range(max_attempts):
            try:
                da = rioxarray.open_rasterio(url, chunks=None, masked=True, lock=False)
                break
            except Exception as exc:
                msg = str(exc)
                # The underlying reader surfaces HTTP failures only through the
                # message text, so throttling is detected by status-code substring.
                transient = "429" in msg or "503" in msg
                if not transient or attempt == max_attempts - 1:
                    # Nothing left to retry: fail now instead of sleeping first.
                    raise
                wait = 10 * (2**attempt)
                logger.warning("CHIRPS3 HTTP error (%s), retrying in %ds: %s", msg[:60], wait, url)
                time.sleep(wait)
        if not isinstance(da, xr.DataArray):
            raise TypeError(f"rioxarray.open_rasterio returned {type(da).__name__!r}, expected DataArray")
        xmin, ymin, xmax, ymax = map(float, bbox)
        da = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax)
        da = da.squeeze("band", drop=True)
        # Guard against files where the mask was not applied via metadata
        da = da.where(da != _CHIRPS3_NODATA)
        da = da.load()

        ds = da.to_dataset(name="precip")
        return ds.expand_dims(time=[np.datetime64(period_id, "D")])  # type: ignore[no-any-return]

    # ------------------------------------------------------------------
    # URL construction and availability
    # ------------------------------------------------------------------

    def _url_for_day(self, d: date) -> str:
        """Return the download URL of the daily file for ``d`` in the configured stage."""
        if self.stage == "final":
            return (
                f"https://data.chc.ucsb.edu/products/CHIRPS/v3.0/daily/final/"
                f"{self.flavor}/cogs/{d.year}/"
                f"chirps-v3.0.{self.flavor}.{d.year}.{d.month:02d}.{d.day:02d}.cog"
            )
        return (
            f"https://data.chc.ucsb.edu/products/CHIRPS/v3.0/daily/prelim/sat/"
            f"{d.year}/chirps-v3.0.prelim.{d.year}.{d.month:02d}.{d.day:02d}.tif"
        )

    @staticmethod
    def _month_end_before(today: date, months_back: int) -> date:
        """Return the last day of the month ``months_back`` months before ``today``'s month."""
        year, month_index = divmod(today.year * 12 + (today.month - 1) - months_back, 12)
        month = month_index + 1
        return date(year, month, calendar.monthrange(year, month)[1])

    def _availability_cutoff(self) -> date:
        """Return the last day of the most recently published complete month.

        Scans backward over the six months preceding the current one, issuing a
        HEAD request on each month's last-day file URL, so the cutoff reflects
        the actual CDN state rather than a hardcoded lag assumption. Falls back
        to the end of the month three months back when no probe returns 200.
        """
        import requests

        today = date.today()
        for months_back in range(1, 7):
            candidate = self._month_end_before(today, months_back)
            try:
                resp = requests.head(self._url_for_day(candidate), timeout=10, allow_redirects=True)
            except Exception:
                # Best effort: an unreachable CDN must not abort the scan.
                logger.debug("CHIRPS3 availability probe failed for %s", candidate, exc_info=True)
                continue
            if resp.status_code == 200:
                return candidate
        # Fallback: 3 months back (safe)
        return self._month_end_before(today, 3)
+ """ + + max_concurrency = 4 + commit_batch_size = 720 # one month of hourly periods + rechunk_time = 12 # group 12 hourly periods per chunk after initial ingest + + def __init__(self, variable: str) -> None: + self.variable = variable + self._cache_ds: xr.Dataset | None = None + self._cache_lock = threading.Lock() + + # ------------------------------------------------------------------ + # Protocol implementation + # ------------------------------------------------------------------ + + def probe(self, bbox: list[float], **_: Any) -> GridSpec: + """Derive GridSpec from ERA5-Land's known 0.1° resolution — no data transfer.""" + xmin, ymin, xmax, ymax = map(float, bbox) + # _select_bbox pads by one pixel in each direction so probe matches fetch shape. + nx = max(1, math.ceil((xmax - xmin + 2 * _ERA5_LAND_RES_DEG) / _ERA5_LAND_RES_DEG)) + ny = max(1, math.ceil((ymax - ymin + 2 * _ERA5_LAND_RES_DEG) / _ERA5_LAND_RES_DEG)) + return GridSpec( + shape=(ny, nx), + crs=4326, + dtype=np.dtype("float32"), + nodata=None, + time_dim=True, + x_dim="x", + y_dim="y", + ) + + def periods(self, start: str, end: str) -> list[str]: + """Return hourly period IDs up to the last timestamp published in the remote store.""" + latest = self._latest_available() + return enumerate_periods(start, min(end, latest), "hourly") + + def _latest_available(self) -> str: + """Read the last valid_time from the remote Zarr store (metadata only, no data loaded).""" + ds = xr.open_dataset( + _DESTINE_ZARR_URL, + engine="zarr", + storage_options=_STORAGE_OPTIONS, + chunks={}, + ) + return str(np.datetime64(ds.valid_time.values[-1], "h")) + + def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: + """Fetch one hourly period from the remote zarr store.""" + hour = int(period_id[-2:]) if len(period_id) > 10 else 0 + date_part = period_id[:10] + + if self._cache_ds is None: + with self._cache_lock: + if self._cache_ds is None: + logger.info("Opening ERA5-Land remote store: %s", 
_DESTINE_ZARR_URL) + self._cache_ds = self._correct_longitude(self._open_remote()) + ds = self._cache_ds + ds = self._select_bbox(ds, bbox).sel(valid_time=f"{date_part}T{hour:02d}") + + # Ensure a length-1 time dimension so append_dim="time" works correctly. + if "valid_time" in ds.dims: + ds = ds.rename({"valid_time": "time"}) + elif "valid_time" in ds.coords and "time" not in ds.dims: + ds = ds.expand_dims("time").assign_coords(time=[ds.valid_time.values]) + + ds = ds.rename({"longitude": "x", "latitude": "y"}) + ds = ds.load() + return ds + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + def _open_remote(self) -> xr.Dataset: + return xr.open_dataset( + _DESTINE_ZARR_URL, + engine="zarr", + storage_options=_STORAGE_OPTIONS, + chunks={}, + )[[self.variable]] + + def _correct_longitude(self, ds: xr.Dataset) -> xr.Dataset: + """Unwrap 0–360 longitude to −180–180 and sort.""" + return ds.assign_coords(longitude=((ds.longitude + 180) % 360 - 180)).sortby("longitude") + + def _select_bbox(self, ds: xr.Dataset, bbox: list[float]) -> xr.Dataset: + xmin, ymin, xmax, ymax = map(float, bbox) + pad = _ERA5_LAND_RES_DEG + return ds.sel( + longitude=slice(xmin - pad, xmax + pad), + latitude=slice(ymax + pad, ymin - pad), + ) diff --git a/climate_api/ingest/plugins/worldpop.py b/climate_api/ingest/plugins/worldpop.py new file mode 100644 index 00000000..1780fdac --- /dev/null +++ b/climate_api/ingest/plugins/worldpop.py @@ -0,0 +1,130 @@ +"""WorldPop IngestionPlugin — yearly population count from WorldPop Global2. + +Authentication: none required (public files on data.worldpop.org). + +Files are per-country GeoTIFFs downloaded in full then clipped to bbox. +Global2 (R2025A) covers 2015–2030 at ~100m resolution (3 arc-seconds). +Global1 covers 2000–2020 at the same resolution (UN-adjusted unconstrained). 
class WorldPopPlugin:
    """IngestionPlugin for WorldPop yearly population count data.

    Args:
        country_code: ISO 3166-1 alpha-3 country code (e.g. 'NOR', 'GHA').
            Stored upper-case (used for directory names); lower-cased when
            building file names.
        version: Dataset version — 'global2' (2015–2030, default) or
            'global1' (2000–2020).

    Raises:
        ValueError: if ``country_code`` is blank or ``version`` is not a
            supported dataset version.

    Each country GeoTIFF is downloaded in full (~100–500 MB) and clipped to
    the bbox in memory. max_concurrency=1 is required to bound peak memory.

    Sync behaviour: WorldPop stores are pyramid stores (pyramid=True). The
    orchestrator cannot append to pyramid stores, so each sync triggers a
    full rematerialization — all years are re-fetched and the pyramid is
    rebuilt from scratch. This is expected and intentional.
    """

    max_concurrency = 1
    commit_batch_size = 1
    rechunk_time: int | None = None
    pyramid: bool = True

    # Inclusive year coverage per supported version; the single source of truth
    # for both constructor validation and periods().
    _YEAR_RANGES: dict[str, tuple[int, int]] = {
        "global2": (2015, 2030),
        "global1": (2000, 2020),
    }

    def __init__(self, country_code: str = "", version: str = "global2") -> None:
        cc = country_code.strip().upper()
        if not cc:
            raise ValueError(
                "WorldPopPlugin requires a 3-letter ISO country code (e.g. 'NOR'). "
                "Set extent.country_code or ingestion.params.country_code."
            )
        # Fail fast: an unknown version would otherwise enumerate global1 years
        # and only be rejected later, when the first URL is built.
        if version not in self._YEAR_RANGES:
            raise ValueError(f"Unknown WorldPop version: {version!r}")
        self.country_code = cc
        self.version = version

    # ------------------------------------------------------------------
    # Protocol implementation
    # ------------------------------------------------------------------

    def probe(self, bbox: list[float], **_: Any) -> GridSpec:
        """Derive GridSpec from WorldPop's known 3 arc-second resolution — no data transfer."""
        xmin, ymin, xmax, ymax = map(float, bbox)
        nx = max(1, math.ceil((xmax - xmin) / _WORLDPOP_RES_DEG))
        ny = max(1, math.ceil((ymax - ymin) / _WORLDPOP_RES_DEG))
        return GridSpec(
            shape=(ny, nx),
            crs=4326,
            dtype=np.dtype("float32"),
            nodata=float("nan"),
            time_dim=True,
        )

    def periods(self, start: str, end: str) -> list[str]:
        """Return year strings in [start, end] clamped to version availability."""
        first_year, last_year = self._YEAR_RANGES[self.version]
        lo = max(int(start[:4]), first_year)
        hi = min(int(end[:4]), last_year)
        return [str(year) for year in range(lo, hi + 1)]

    def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset:
        """Download a per-country GeoTIFF, clip to bbox, return as Dataset.

        Raises:
            requests.HTTPError: if the download responds with an error status.
            TypeError: if rioxarray does not return a DataArray.
        """
        import requests
        import rioxarray

        year = int(period_id)
        url = self._url_for_year(year)
        logger.info("Fetching WorldPop %s %s: %s", self.country_code, period_id, url)
        resp = requests.get(url, timeout=300)
        resp.raise_for_status()

        da = rioxarray.open_rasterio(io.BytesIO(resp.content))
        if not isinstance(da, xr.DataArray):
            raise TypeError(f"rioxarray.open_rasterio returned {type(da).__name__!r}, expected DataArray")
        xmin, ymin, xmax, ymax = map(float, bbox)
        da = da.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax)
        da = da.squeeze("band", drop=True)
        # Mask the file's declared nodata sentinel to NaN before writing
        # (skipped when the sentinel is absent or already NaN).
        _nodata = da.rio.nodata
        if _nodata is not None and not math.isnan(float(_nodata)):
            da = da.where(da != _nodata)
        da = da.load()

        ds = da.to_dataset(name="pop_total")
        return ds.expand_dims(time=[np.datetime64(f"{period_id}-01-01", "D")])  # type: ignore[no-any-return]

    # ------------------------------------------------------------------
    # URL construction
    # ------------------------------------------------------------------

    def _url_for_year(self, year: int) -> str:
        """Return the download URL of the country GeoTIFF for ``year``.

        Raises:
            ValueError: if ``self.version`` is not a known dataset version.
        """
        cc = self.country_code
        if self.version == "global2":
            filename = f"{cc.lower()}_pop_{year}_CN_100m_R2025A_v1.tif"
            return (
                f"https://data.worldpop.org/GIS/Population/Global_2015_2030/R2025A/"
                f"{year}/{cc}/v1/100m/constrained/{filename}"
            )
        if self.version == "global1":
            filename = f"{cc.lower()}_ppp_{year}_UNadj.tif"
            return f"https://data.worldpop.org/GIS/Population/Global_2000_2020/{year}/{cc}/{filename}"
        raise ValueError(f"Unknown WorldPop version: {self.version!r}")
+ + extra_dims: optional non-spatial, non-time dimensions in the store, e.g. + {"age_group": 20, "sex": 2}. The orchestrator does not use this field; + it exists for plugin authors who need to document multidimensional + stores and for future orchestrator extensions. + """ + + shape: tuple[int, int] + crs: int + dtype: np.dtype + nodata: float | None = None + time_dim: bool = True + x_dim: str = "x" + y_dim: str = "y" + attrs: dict[str, Any] = field(default_factory=dict) + extra_dims: dict[str, int] = field(default_factory=dict) + + +@runtime_checkable +class IngestionPlugin(Protocol): + """Minimal interface a plugin must implement for per-period Icechunk ingest. + + The climate-api layer owns the orchestration loop — plugins never touch + zarr or Icechunk directly. Implement the three sync methods and declare + max_concurrency and commit_batch_size as class attributes. + + max_concurrency: maximum number of fetch_period calls in flight at once. + Keep at 1 for sources with large per-period files or rate-limited APIs. + Raise for sources where individual periods are small (< 50 MB). + + commit_batch_size: cursor checkpoint interval. + Every period is always committed individually to Icechunk. This + attribute controls how frequently the orchestrator persists the job + cursor so that a restart resumes from the last checkpoint rather than + re-scanning the whole store. Use 1 for monthly sources, ~30 for daily, + ~720 for hourly. + + rechunk_time (optional class attribute): target time chunk size for the + post-ingest rechunk. When set, the orchestrator rewrites the store after + all periods are committed so the time axis uses chunks of this size + instead of the per-period chunk-of-1. Set to a positive int (30 for + daily, 720 for hourly) to enable rechunking. Omitting the attribute + entirely is equivalent to ``None`` — the orchestrator uses + ``getattr(plugin, "rechunk_time", None)`` so plugins that omit it still + pass the ``isinstance`` check. 
def enumerate_periods(start: str, end: str, period_type: str, cutoff: date | None = None) -> list[str]:
    """Generate ordered period IDs for [start, end], optionally clamped to cutoff.

    period_type values and ID formats:
      'daily'   → YYYY-MM-DD
      'hourly'  → YYYY-MM-DDTHH
      'monthly' → YYYY-MM
      'yearly'  → YYYY

    cutoff clips the end of the range to the last period on or before that date.
    For 'hourly', the cutoff is inclusive through the final hour of the cutoff
    date, and a date-only ``end`` ('YYYY-MM-DD') likewise includes every hour of
    that day.

    Returns:
        The period IDs in ascending order; empty when the range is empty.

    Raises:
        ValueError: if ``period_type`` is not one of the four supported values.
    """
    if period_type == "daily":
        first_day = date.fromisoformat(start[:10])
        last_day = date.fromisoformat(end[:10])
        if cutoff is not None:
            last_day = min(last_day, cutoff)
        # A negative span yields an empty range, i.e. no periods.
        span = (last_day - first_day).days + 1
        return [(first_day + timedelta(days=offset)).isoformat() for offset in range(span)]

    if period_type == "hourly":
        # IDs are compared as strings. A bare date sorts *before* every hour of
        # that same day ("2024-01-02" < "2024-01-02T00"), so a date-only end
        # must be padded to its last hour or the whole final day is dropped.
        last_hour = end if len(end) > 10 else f"{end}T23"
        if cutoff is not None:
            last_hour = min(last_hour, f"{cutoff.isoformat()}T23")
        if start > last_hour:
            return []
        hours: list[str] = []
        day = date.fromisoformat(start[:10])
        final_day = date.fromisoformat(last_hour[:10])
        while day <= final_day:
            stem = day.isoformat()
            for hour in range(24):
                period = f"{stem}T{hour:02d}"
                if start <= period <= last_hour:
                    hours.append(period)
            day += timedelta(days=1)
        return hours

    if period_type == "monthly":
        # A year-only bound means January for start and December for end.
        sy, sm = int(start[:4]), int(start[5:7]) if len(start) >= 7 else 1
        ey, em = int(end[:4]), int(end[5:7]) if len(end) >= 7 else 12
        if cutoff is not None:
            ey, em = min((ey, em), (cutoff.year, cutoff.month))
        months: list[str] = []
        y, m = sy, sm
        while (y, m) <= (ey, em):
            months.append(f"{y:04d}-{m:02d}")
            m += 1
            if m > 12:
                m, y = 1, y + 1
        return months

    if period_type == "yearly":
        first_year = int(start[:4])
        last_year = int(end[:4])
        if cutoff is not None:
            last_year = min(last_year, cutoff.year)
        return [str(year) for year in range(first_year, last_year + 1)]

    raise ValueError(f"Unknown period_type: {period_type!r}")
"icechunk.Repository": + """Open an existing Icechunk repository or create one at store_path.""" + import icechunk + + storage = icechunk.local_filesystem_storage(str(store_path)) + if store_path.exists(): + return icechunk.Repository.open(storage) + return icechunk.Repository.create(storage) + + +def rechunk_store(store_path: Path, *, time_chunk: int) -> None: + """Rewrite the committed Icechunk store with a coarser time chunk size. + + Opens the latest committed snapshot for reading and a new writable session + for writing, lazily rechunks the time dimension via dask, then commits the + result as a new snapshot. Icechunk's MVCC ensures the previous snapshot is + preserved — if the rechunk fails the store rolls back to its original state. + + A no-op when the store does not exist or has no time dimension. + """ + import xarray as xr + + if not store_path.exists(): + return + + repo = open_or_create_repo(store_path) + read_session = repo.readonly_session("main") + ds = xr.open_zarr(read_session.store) + try: + n_times = ds.sizes.get("time", 0) + if n_times == 0: + return + + effective_chunk = min(time_chunk, n_times) + # Keys that are CF conventions attrs, not valid zarr encoding parameters. + # xarray copies them into .encoding when reading from zarr; strip them + # before passing back to to_zarr() to avoid ValueError. 
def build_pyramid_store(store_path: Path, *, x_dim: str = "x", y_dim: str = "y") -> None:
    """Replace the flat committed store with a multiscale pyramid.

    The number of levels is computed from the larger spatial dimension and the
    512-pixel tile target, capped at ``_PYRAMID_MAX_LEVELS``. Nothing happens
    when the store is missing or its ``x_dim`` × ``y_dim`` pixel count does not
    exceed ``_PYRAMID_PIXEL_THRESHOLD``.

    The whole dataset is loaded into memory, handed to topozarr's
    ``create_pyramid`` and written back with ``mode="w"`` in a single commit on
    ``main``. Earlier ingest snapshots are not touched here.
    """
    import xarray as xr
    from topozarr import create_pyramid

    if not store_path.exists():
        return

    import zarr

    repo = open_or_create_repo(store_path)
    snapshot = repo.readonly_session("main")
    flat = xr.open_zarr(snapshot.store)
    try:
        nx = flat.sizes.get(x_dim, 0)
        ny = flat.sizes.get(y_dim, 0)
        if nx * ny <= _PYRAMID_PIXEL_THRESHOLD:
            logger.info("Skipping pyramid for %s: %dx%d below threshold", store_path, nx, ny)
            return
        tiles_along_longest_side = max(nx, ny) / _PYRAMID_TARGET_TILE_SIZE
        wanted_levels = math.ceil(math.log2(tiles_along_longest_side))
        levels = min(wanted_levels, _PYRAMID_MAX_LEVELS)
        in_memory = flat.load()
    finally:
        flat.close()

    # topozarr needs an xproj CRS on the dataset. It is taken from the GeoZarr
    # root attribute (proj:code = "EPSG:<code>"), defaulting to EPSG:4326.
    try:
        import xproj  # noqa: F401  # pyright: ignore[reportUnusedImport]

        root_group = zarr.open_group(snapshot.store, mode="r")
        proj_code = str(root_group.attrs.get("proj:code", "EPSG:4326"))
        if ":" in proj_code:
            epsg = int(proj_code.split(":")[1])
        else:
            epsg = 4326
        in_memory = in_memory.proj.assign_crs({"EPSG": epsg})
    except Exception:
        # Best effort: the pyramid is still built when the CRS cannot be attached.
        logger.warning("Could not assign CRS for pyramid build on %s; proceeding without xproj", store_path)

    pyramid = create_pyramid(in_memory, levels=levels, x_dim=x_dim, y_dim=y_dim)

    # Drop "shards" from every variable's encoding: the sharding_indexed codec is
    # not supported by the zarr-layer (JS) client and causes a render loop in the
    # map viewer.
    no_shard_encoding = {}
    for level, per_variable in pyramid.encoding.items():
        cleaned = {}
        for var_name, var_encoding in per_variable.items():
            cleaned[var_name] = {key: value for key, value in var_encoding.items() if key != "shards"}
        no_shard_encoding[level] = cleaned

    writer = repo.writable_session("main")
    pyramid.dt.to_zarr(writer.store, mode="w", encoding=no_shard_encoding, zarr_format=3)
    writer.commit(f"pyramid: {levels} levels")
    logger.info("Built %d-level pyramid for %s (%dx%d)", levels, store_path, nx, ny)
+ if ds.sizes and all(s > 0 for s in ds.sizes.values()): + return {"static"} + return set() + import pandas as pd + + return {datetime_to_period_string(pd.Timestamp(t.item()).to_pydatetime(), period_type) for t in ds.time} + finally: + ds.close() + except Exception: + logger.debug("Could not read committed periods from %s", store_path, exc_info=True) + return set() diff --git a/climate_api/ingestions/execution.py b/climate_api/ingestions/execution.py new file mode 100644 index 00000000..45bbac79 --- /dev/null +++ b/climate_api/ingestions/execution.py @@ -0,0 +1,49 @@ +"""Execution function for the ingest process (used by the async jobs framework).""" + +from __future__ import annotations + +from typing import Any + +from fastapi import HTTPException + +from climate_api.data_registry.services import datasets as registry_datasets +from climate_api.extents.services import get_extent_or_404 +from climate_api.ingestions import services + + +def execute_ingest( + *, + dataset_id: str, + start: str, + end: str | None = None, + overwrite: bool = False, + publish: bool = True, + on_progress: Any | None = None, + is_cancel_requested: Any | None = None, + save_cursor: Any | None = None, +) -> dict[str, Any]: + """Ingest one dataset for the configured extent and return a result summary. + + Accepts optional job-framework callbacks (on_progress, is_cancel_requested, + save_cursor) so that progress is visible when run as an async job. 
+ """ + dataset = registry_datasets.get_dataset(dataset_id) + if dataset is None: + raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found") + extent = get_extent_or_404() + resolved_bbox = list(extent["bbox"]) + artifact = services.create_artifact( + dataset=dataset, + start=start, + end=end, + bbox=resolved_bbox, + overwrite=overwrite, + publish=publish, + on_progress=on_progress, + is_cancel_requested=is_cancel_requested, + save_cursor=save_cursor, + ) + return { + "status": "completed", + "ingestion_id": artifact.artifact_id, + } diff --git a/climate_api/ingestions/routes.py b/climate_api/ingestions/routes.py index b558fe22..38d997a6 100644 --- a/climate_api/ingestions/routes.py +++ b/climate_api/ingestions/routes.py @@ -1,7 +1,8 @@ """Routes for EO ingestion, datasets, and sync operations.""" -from fastapi import APIRouter, HTTPException -from fastapi.responses import FileResponse +from typing import Any + +from fastapi import APIRouter, Header from starlette.responses import Response from climate_api.data_registry.routes import _get_dataset_or_404 @@ -24,21 +25,51 @@ sync_router = APIRouter() -@ingestions_router.post("", response_model=IngestionResponse) -def create_ingestion(request: CreateIngestionRequest) -> IngestionResponse: - """Create or update a managed dataset from a dataset template and configured extent.""" +def _prefer_respond_async(prefer: str | None) -> bool: + if prefer is None: + return False + directives = [item.strip().split(";", 1)[0].strip().lower() for item in prefer.split(",")] + return "respond-async" in directives + + +@ingestions_router.post("", response_model=None) +def create_ingestion( + request: CreateIngestionRequest, + response: Response, + prefer: str | None = Header(default=None), +) -> Any: + """Create or update a managed dataset from a dataset template and configured extent. 
+ + Send ``Prefer: respond-async`` to run the ingest as a background job and + receive HTTP 202 with a ``Location: /jobs/{job_id}`` header immediately. + Poll ``GET /jobs/{job_id}`` for progress and completion status. + """ + if _prefer_respond_async(prefer): + from climate_api.jobs.service import get_job_service + + job = get_job_service().submit_process_job( + process_id="ingest", + request={ + "dataset_id": request.dataset_id, + "start": request.start, + "end": request.end, + "overwrite": request.overwrite, + "publish": request.publish, + }, + ) + response.status_code = 202 + response.headers["Location"] = f"/jobs/{job.job_id}" + return job + dataset = _get_dataset_or_404(request.dataset_id) extent = get_extent_or_404() resolved_bbox = list(extent["bbox"]) - resolved_country_code = extent.get("country_code") artifact = services.create_artifact( dataset=dataset, start=request.start, end=request.end, bbox=resolved_bbox, - country_code=resolved_country_code, overwrite=request.overwrite, - prefer_zarr=request.prefer_zarr, publish=request.publish, ) return IngestionResponse( @@ -72,21 +103,6 @@ def get_dataset(dataset_id: str) -> DatasetDetailRecord: return services.get_dataset_or_404(dataset_id) -@datasets_router.get("/{dataset_id}/download") -def download_artifact_file(dataset_id: str) -> FileResponse: - """Download the primary saved file for a dataset when available.""" - artifact = services.get_latest_artifact_for_dataset_or_404(dataset_id) - if artifact.path is None or artifact.format.value == "zarr": - raise HTTPException( - status_code=409, - detail="Dataset is not a single downloadable file; use metadata and dataset assets instead", - ) - - media_type = "application/x-netcdf" - filename = f"{dataset_id}.nc" - return FileResponse(artifact.path, media_type=media_type, filename=filename) - - @zarr_router.api_route("/{dataset_id}", methods=["GET", "HEAD"]) def get_canonical_zarr_store_info(dataset_id: str) -> dict[str, object]: """Return canonical Zarr store 
listing for a managed dataset.""" @@ -94,7 +110,7 @@ def get_canonical_zarr_store_info(dataset_id: str) -> dict[str, object]: @zarr_router.api_route("/{dataset_id}/{relative_path:path}", methods=["GET", "HEAD"], response_model=None) -def get_canonical_zarr_store_file(dataset_id: str, relative_path: str) -> FileResponse | Response | dict[str, object]: +def get_canonical_zarr_store_file(dataset_id: str, relative_path: str) -> Response | dict[str, object]: """Serve canonical Zarr store content for a managed dataset.""" return services.get_dataset_zarr_store_file_or_404(dataset_id, relative_path) @@ -105,7 +121,6 @@ def sync_dataset(dataset_id: str, request: SyncDatasetRequest) -> SyncResponse: return services.sync_dataset( dataset_id=dataset_id, end=request.end, - prefer_zarr=request.prefer_zarr, publish=request.publish, ) diff --git a/climate_api/ingestions/schemas.py b/climate_api/ingestions/schemas.py index eb3d7292..74150e7d 100644 --- a/climate_api/ingestions/schemas.py +++ b/climate_api/ingestions/schemas.py @@ -10,7 +10,7 @@ class ArtifactFormat(StrEnum): """Supported stored artifact formats.""" ZARR = "zarr" - NETCDF = "netcdf" + ICECHUNK = "icechunk" class PublicationStatus(StrEnum): @@ -53,8 +53,12 @@ class CoverageSpatial(BaseModel): class CoverageTemporal(BaseModel): """Temporal extent summary.""" - start: str = Field(description="First covered time period in dataset-native string form.") - end: str = Field(description="Last covered time period in dataset-native string form.") + start: str | None = Field( + description="First covered time period in dataset-native string form. None for static (timeless) datasets." + ) + end: str | None = Field( + description="Last covered time period in dataset-native string form. None for static (timeless) datasets." 
+ ) class ArtifactCoverage(BaseModel): @@ -98,7 +102,6 @@ class ArtifactRecord(BaseModel): variable: str period_type: str | None = None format: ArtifactFormat - path: str | None = None asset_paths: list[str] = Field(default_factory=list) variables: list[str] = Field(default_factory=list) request_scope: ArtifactRequestScope @@ -117,10 +120,6 @@ class CreateIngestionRequest(BaseModel): default=False, description="Whether to force regeneration of an existing matching artifact.", ) - prefer_zarr: bool = Field( - default=True, - description="Whether to prefer GeoZarr materialization when available.", - ) publish: bool = Field( default=True, description="Whether to publish the resulting dataset through pygeoapi.", @@ -272,7 +271,6 @@ class SyncDatasetRequest(BaseModel): """Request payload for syncing a managed dataset forward.""" end: str | None = Field(default=None, description="Optional end period to sync through.") - prefer_zarr: bool = Field(default=True, description="Whether to prefer GeoZarr materialization when syncing.") publish: bool = Field(default=True, description="Whether to publish the resulting dataset version.") diff --git a/climate_api/ingestions/services.py b/climate_api/ingestions/services.py index 04fbe709..51e4ecae 100644 --- a/climate_api/ingestions/services.py +++ b/climate_api/ingestions/services.py @@ -6,19 +6,22 @@ import logging import mimetypes import os +import threading from collections.abc import Callable from datetime import UTC, datetime from pathlib import Path +from typing import Any from uuid import uuid4 import portalocker import pyproj +import xarray as xr from fastapi import HTTPException -from fastapi.responses import FileResponse, JSONResponse +from fastapi.responses import FileResponse, HTMLResponse, JSONResponse from starlette.responses import Response from climate_api import config as api_config -from climate_api.data_accessor.services.accessor import get_data_coverage_for_paths +from climate_api.data_accessor.services.accessor 
import coverage_from_open_dataset, get_data_coverage_for_paths from climate_api.data_manager.services import downloader from climate_api.data_registry.services import datasets as registry_datasets from climate_api.extents.services import get_extent @@ -49,6 +52,77 @@ logger = logging.getLogger(__name__) +# Per-store threading locks prevent two concurrent ingest/sync runs from writing +# to the same Icechunk store simultaneously (which causes MVCC commit conflicts). +_store_locks: dict[str, threading.Lock] = {} +_store_locks_mutex = threading.Lock() + + +def _acquire_store_lock(store_path: Path) -> threading.Lock: + """Return the exclusive lock for store_path, creating it if needed.""" + key = str(store_path.resolve()) + with _store_locks_mutex: + if key not in _store_locks: + _store_locks[key] = threading.Lock() + return _store_locks[key] + + +def _check_bbox_overlap(dataset: dict[str, object], instance_bbox: list[float]) -> None: + """Raise HTTP 400 if the dataset's declared spatial extent does not overlap the instance bbox.""" + extents = dataset.get("extents") + if not isinstance(extents, dict): + return + spatial = extents.get("spatial") + if not isinstance(spatial, dict): + return + dataset_bbox = spatial.get("bbox") + if not (isinstance(dataset_bbox, list) and len(dataset_bbox) == 4): + return + try: + dx_min, dy_min, dx_max, dy_max = (float(v) for v in dataset_bbox) + except (TypeError, ValueError): + return # malformed dataset bbox — skip overlap check + ix_min, iy_min, ix_max, iy_max = (float(v) for v in instance_bbox) + if dx_max <= ix_min or dx_min >= ix_max or dy_max <= iy_min or dy_min >= iy_max: + raise HTTPException( + status_code=400, + detail=( + f"Dataset '{dataset.get('id')}' spatial extent {dataset_bbox} " + f"does not overlap the configured instance extent {instance_bbox}" + ), + ) + + +def _read_crs_from_spatial_ref(ds: xr.Dataset) -> str | None: + """Return an EPSG CRS string from a dataset, or None if undetectable. 
+ + Checks spatial_ref WKT first, then falls back to dimension units/standard_name + so that datasets like ERA5-Land (no spatial_ref, but degrees_east/north units) + are not misidentified as projected. + """ + if "spatial_ref" in ds.coords: + try: + import pyproj + + attrs = dict(ds["spatial_ref"].attrs) + wkt = attrs.get("crs_wkt") or attrs.get("spatial_ref") + if wkt: + epsg = pyproj.CRS.from_wkt(str(wkt)).to_epsg() + if epsg: + return f"EPSG:{epsg}" + except Exception: + pass + for dim in set(ds.dims): + if dim not in ds.coords: + continue + attrs = dict(ds[dim].attrs) + if attrs.get("units") in ("degrees_east", "degrees_north") or attrs.get("standard_name") in ( + "longitude", + "latitude", + ): + return "EPSG:4326" + return None + def _resolve_artifacts_dir() -> Path: from climate_api import config as api_config @@ -151,28 +225,30 @@ def create_artifact( start: str, end: str | None, bbox: list[float] | None, - country_code: str | None, overwrite: bool, - prefer_zarr: bool, publish: bool, download_start: str | None = None, download_end: str | None = None, + on_progress: Any | None = None, + is_cancel_requested: Any | None = None, + save_cursor: Any | None = None, ) -> ArtifactRecord: - """Download a dataset, persist it locally, and store artifact metadata.""" + """Ingest a dataset via its plugin, persist it locally, and store artifact metadata.""" period_type = str(dataset["period_type"]) start = _normalize_request_period(start, period_type=period_type, field_name="start") - end = _normalize_optional_request_period(end, period_type=period_type, field_name="end") + end = _normalize_optional_request_period(end, period_type=period_type, field_name="end", is_end=True) download_start = _normalize_optional_request_period( download_start, period_type=period_type, field_name="download_start" ) - download_end = _normalize_optional_request_period(download_end, period_type=period_type, field_name="download_end") + download_end = _normalize_optional_request_period( + 
download_end, period_type=period_type, field_name="download_end", is_end=True + ) _validate_download_scope( start=start, end=end, download_start=download_start, download_end=download_end, ) - requires_canonical_zarr = download_start is not None resolved_download_end = download_end if download_end is not None else end if resolved_download_end is None: resolved_download_end = _default_request_end(period_type) @@ -184,7 +260,6 @@ def create_artifact( existing = _find_existing_artifact( dataset_id=str(dataset["id"]), request_scope=request_scope, - prefer_zarr=prefer_zarr or requires_canonical_zarr, ) if existing is not None and not overwrite: logger.info( @@ -198,124 +273,188 @@ def create_artifact( return publish_artifact_record(existing.artifact_id) return existing - logger.info( - "Downloading dataset '%s': request_scope=%s..%s download_scope=%s..%s prefer_zarr=%s publish=%s", - dataset["id"], - start, - end, - download_start or start, - resolved_download_end, - prefer_zarr, - publish, - ) - downloaded_files = downloader.download_dataset( - dataset, - start=download_start or start, + return _create_icechunk_artifact( + dataset=dataset, + start=start, end=resolved_download_end, bbox=bbox, - country_code=country_code, + request_scope=request_scope, overwrite=overwrite, - background_tasks=None, + publish=publish, + ingest_start=download_start, + on_progress=on_progress, + is_cancel_requested=is_cancel_requested, + save_cursor=save_cursor, ) - logger.info("Download finished for dataset '%s': changed_files=%d", dataset["id"], len(downloaded_files)) - if prefer_zarr or requires_canonical_zarr: - try: - logger.info("Building canonical Zarr artifact for dataset '%s'", dataset["id"]) - downloader.build_dataset_zarr(dataset, start=start, end=end) - logger.info("Canonical Zarr artifact built for dataset '%s'", dataset["id"]) - except Exception as exc: - if requires_canonical_zarr: - if isinstance(exc, ValueError): - raise HTTPException( - status_code=409, - detail=f"Append 
def _create_icechunk_artifact(
    *,
    dataset: dict[str, object],
    start: str,
    end: str,
    bbox: list[float] | None,
    request_scope: ArtifactRequestScope,
    overwrite: bool = False,
    publish: bool,
    ingest_start: str | None = None,
    on_progress: Any | None = None,
    is_cancel_requested: Any | None = None,
    save_cursor: Any | None = None,
) -> ArtifactRecord:
    """Run per-period Icechunk ingest and register the resulting store as an artifact.

    `ingest_start` is the period from which the orchestrator begins its period scan.
    For delta/append syncs this is the first missing period (delta_start), which avoids
    enumerating the entire historical range just to discover that all prior periods are
    already committed. When omitted the full artifact `start` is used.

    Args:
        dataset: Dataset template; ``id``, ``period_type``, ``name``, ``variable`` and
            ``ingestion["plugin"]`` are read, plus optional ``ingestion["params"]``,
            ``transforms`` and ``extents``.
        start: First period of the artifact scope.
        end: Last period to ingest.
        bbox: Bounding box to ingest; falls back to the configured extent, then to
            the whole globe.
        request_scope: Scope stored on the artifact record (its ``end`` may be
            replaced by the realized coverage end).
        overwrite: When true, an existing store directory is deleted before ingest.
        publish: When true, the stored artifact is published if it is not already.
        ingest_start: Optional delta start; when given, rechunking and pyramid
            building are skipped.
        on_progress, is_cancel_requested, save_cursor: Passed through unchanged to
            ``run_ingest_sync``.

    Raises:
        HTTPException: 400 for a non-overlapping bbox or a plugin configuration
            error; 409 when another ingest holds the store lock, when no store was
            produced, when the store cannot be read, or when it holds no data.
    """
    from climate_api.ingest.orchestrator import load_plugin, run_ingest_sync
    from climate_api.ingest.store import open_or_create_repo

    dataset_id = str(dataset["id"])
    period_type = str(dataset["period_type"])
    _raw_ingestion = dataset.get("ingestion")
    ingestion: dict[str, object] = dict(_raw_ingestion) if isinstance(_raw_ingestion, dict) else {}
    plugin_path = str(ingestion["plugin"])
    _raw_params = ingestion.get("params")
    params: dict[str, object] = dict(_raw_params) if isinstance(_raw_params, dict) else {}

    extent = get_extent()
    resolved_bbox: list[float] = (
        list(bbox) if bbox is not None else (list(extent["bbox"]) if extent else [-180, -90, 180, 90])
    )
    _check_bbox_overlap(dataset, resolved_bbox)
    store_path = downloader.get_icechunk_path(dataset)

    # Non-blocking acquire: a second ingest for the same store is rejected
    # immediately instead of queueing behind the running one.
    lock = _acquire_store_lock(store_path)
    if not lock.acquire(blocking=False):
        raise HTTPException(
            status_code=409,
            detail=f"An ingest or sync is already running for dataset '{dataset_id}'. Wait for it to finish.",
        )

    try:
        if overwrite and store_path.exists():
            import shutil

            shutil.rmtree(store_path)
            logger.info("Cleared existing store for overwrite: %s", store_path)

        extent_country_code = extent.get("country_code") if extent else None
        extra_params: dict[str, object] = {}
        if extent_country_code:
            extra_params["country_code"] = extent_country_code
        try:
            plugin = load_plugin(plugin_path, params, extra_params=extra_params or None)
        except (TypeError, ValueError) as exc:
            raise HTTPException(status_code=400, detail=f"Plugin configuration error: {exc}") from exc

        effective_start = ingest_start if ingest_start is not None else start
        # Rechunk after the initial ingest (when no delta start is provided) using the
        # plugin's declared rechunk_time, if any. Sync appends skip rechunking to avoid
        # rewriting the full store on every small update.
        rechunk_time: int | None = getattr(plugin, "rechunk_time", None) if ingest_start is None else None
        pyramid: bool = bool(getattr(plugin, "pyramid", False)) if ingest_start is None else False
        logger.info(
            "Running Icechunk ingest for '%s': ingest_scope=%s..%s artifact_scope=%s..%s rechunk_time=%s pyramid=%s",
            dataset_id,
            effective_start,
            end,
            start,
            end,
            rechunk_time,
            pyramid,
        )
        transforms = dataset.get("transforms")
        apply_transforms = (lambda ds: downloader._run_transforms(ds, dataset)) if transforms else None
        run_ingest_sync(
            plugin=plugin,
            params=params,
            bbox=resolved_bbox,
            start=effective_start,
            end=end,
            store_path=store_path,
            period_type=period_type,
            rechunk_time=rechunk_time,
            apply_transforms=apply_transforms,
            pyramid=pyramid,
            on_progress=on_progress,
            is_cancel_requested=is_cancel_requested,
            save_cursor=save_cursor,
        )
    finally:
        lock.release()

    if not store_path.exists():
        raise HTTPException(status_code=409, detail="Plugin returned no periods for the requested range")

    repo = open_or_create_repo(store_path)
    session = repo.readonly_session("main")
    import xarray as xr

    try:
        ds = xr.open_zarr(session.store)
        # For pyramid stores the data and time live under group "0"; root has
        # only multiscales metadata with empty coordinates.
        if "time" not in ds.coords and "multiscales" in ds.attrs:
            ds.close()
            ds = xr.open_zarr(session.store, group="0")
    except Exception as exc:
        raise HTTPException(status_code=409, detail="Ingest produced no readable data for the requested range") from exc
    from climate_api import config as api_config

    # CRS detection runs inside the try so the dataset is closed even when it
    # (or the config lookup) raises.
    try:
        native_crs = _read_crs_from_spatial_ref(ds) or api_config.get_crs() or "EPSG:4326"
        coverage_data = coverage_from_open_dataset(ds, period_type=period_type, native_crs=native_crs)
    finally:
        ds.close()

    if not coverage_data.get("has_data", True):
        raise HTTPException(status_code=409, detail="Icechunk store contains no data for the requested scope")

    _spatial_wgs84_data = coverage_data["coverage"].get("spatial_wgs84")
    coverage = ArtifactCoverage(
        temporal=CoverageTemporal(**coverage_data["coverage"]["temporal"]),
        spatial=CoverageSpatial(**coverage_data["coverage"]["spatial"]),
        spatial_wgs84=CoverageSpatial(**_spatial_wgs84_data) if _spatial_wgs84_data else None,
    )

    # When a plugin clamps availability (e.g. CHIRPS3 has a 3-month lag), the
    # realized coverage end is earlier than the requested end. Normalise the
    # stored request_scope to the actual coverage end so that
    # _artifact_coverage_matches_request_scope passes on future requests for the
    # same realized range, instead of triggering an unnecessary re-ingest.
    if request_scope.end is not None and coverage.temporal.end != request_scope.end:
        request_scope = ArtifactRequestScope(
            start=request_scope.start,
            end=coverage.temporal.end,
            bbox=request_scope.bbox,
        )

    record = ArtifactRecord(
        artifact_id=str(uuid4()),
        dataset_id=dataset_id,
        dataset_name=str(dataset["name"]),
        variable=str(dataset["variable"]),
        format=ArtifactFormat.ICECHUNK,
        asset_paths=[str(store_path.resolve())],
        variables=[str(dataset["variable"])],
        request_scope=request_scope,
        coverage=coverage,
        created_at=datetime.now(UTC),
        publication=ArtifactPublication(),
    )
    stored = _upsert_icechunk_artifact_record(record)
    logger.info(
        "Stored Icechunk artifact '%s' for '%s': coverage=%s..%s",
        stored.artifact_id,
        dataset_id,
        stored.coverage.temporal.start,
        stored.coverage.temporal.end,
    )
    if publish and stored.publication.status != PublicationStatus.PUBLISHED:
        return publish_artifact_record(stored.artifact_id)
    return stored
_normalize_optional_request_period(end, period_type=period_type, field_name="end") + normalized_end = _normalize_optional_request_period(end, period_type=period_type, field_name="end", is_end=True) request_scope = ArtifactRequestScope( start=normalized_start, end=normalized_end, @@ -373,7 +512,6 @@ def store_materialized_zarr_artifact( variable=str(dataset["variable"]), period_type=str(dataset.get("period_type")) if dataset.get("period_type") is not None else None, format=ArtifactFormat.ZARR, - path=str(zarr_path.resolve()), asset_paths=[str(zarr_path.resolve())], variables=[str(dataset["variable"])], request_scope=request_scope, @@ -381,7 +519,7 @@ def store_materialized_zarr_artifact( created_at=datetime.now(UTC), publication=ArtifactPublication(), ) - stored_record = _upsert_artifact_record(record, prefer_zarr=True, publish=publish, overwrite=overwrite) + stored_record = _upsert_artifact_record(record, publish=publish, overwrite=overwrite) if publish and stored_record.publication.status != PublicationStatus.PUBLISHED: return publish_artifact_record(stored_record.artifact_id) return stored_record @@ -391,31 +529,47 @@ def sync_dataset( *, dataset_id: str, end: str | None, - prefer_zarr: bool, publish: bool, + on_progress: Any | None = None, ) -> SyncResponse: """Resolve sync inputs and delegate managed-dataset sync to the sync engine. The service layer stays thin on purpose: it validates that the requested public dataset id resolves to a managed dataset plus a source template, then hands execution to `sync_engine.run_sync(...)`. + + For Icechunk artifacts the authoritative `current_end` is read directly from + the store's committed period log rather than from the potentially-stale artifact + metadata record, so the sync plan reflects the true on-disk state. 
""" latest_artifact = get_latest_artifact_for_dataset_or_404(dataset_id) source_dataset = registry_datasets.get_dataset(latest_artifact.dataset_id) if source_dataset is None: raise HTTPException(status_code=404, detail=f"Source dataset '{latest_artifact.dataset_id}' not found") - extent = get_extent() - resolved_country_code = extent.get("country_code") if extent else None + committed_end: str | None = None + if latest_artifact.format == ArtifactFormat.ICECHUNK and latest_artifact.asset_paths: + from climate_api.ingest.store import read_committed_period_ids + + period_type = str(source_dataset.get("period_type", "")) + committed = read_committed_period_ids(Path(latest_artifact.asset_paths[0]), period_type) + committed_end = max(committed) if committed else None + logger.info( + "Icechunk store-based current_end for '%s': %s (artifact record had: %s)", + dataset_id, + committed_end, + latest_artifact.coverage.temporal.end, + ) + try: return run_sync( latest_artifact=latest_artifact, source_dataset=source_dataset, requested_end=end, - country_code=resolved_country_code, - prefer_zarr=prefer_zarr, publish=publish, create_artifact_fn=create_artifact, get_dataset_fn=get_dataset_or_404, + current_end=committed_end, + on_progress=on_progress, ) except SyncConfigurationError as exc: raise HTTPException(status_code=500, detail=str(exc)) from exc @@ -433,11 +587,19 @@ def plan_sync_dataset( source_dataset = registry_datasets.get_dataset(latest_artifact.dataset_id) if source_dataset is None: raise HTTPException(status_code=404, detail=f"Source dataset '{latest_artifact.dataset_id}' not found") + committed_end: str | None = None + if latest_artifact.format == ArtifactFormat.ICECHUNK and latest_artifact.asset_paths: + from climate_api.ingest.store import read_committed_period_ids + + period_type = str(source_dataset.get("period_type", "")) + committed = read_committed_period_ids(Path(latest_artifact.asset_paths[0]), period_type) + committed_end = max(committed) if committed 
else None try: return plan_sync( latest_artifact=latest_artifact, source_dataset=source_dataset, requested_end=end, + current_end=committed_end, ) except SyncConfigurationError as exc: raise HTTPException(status_code=500, detail=str(exc)) from exc @@ -448,6 +610,10 @@ def plan_sync_dataset( def get_dataset_zarr_store_info_or_404(dataset_id: str) -> dict[str, object]: """Return a public Zarr store listing for a managed dataset.""" artifact = get_latest_artifact_for_dataset_or_404(dataset_id) + + if artifact.format == ArtifactFormat.ICECHUNK: + return _icechunk_store_info(dataset_id, artifact) + store_root = _get_zarr_root_or_409(artifact) entries = _zarr_entries(dataset_id=dataset_id, store_root=store_root, directory=store_root) @@ -457,7 +623,46 @@ def get_dataset_zarr_store_info_or_404(dataset_id: str) -> dict[str, object]: return { "kind": "ZarrListing", "dataset_id": dataset_id, - "format": artifact.format, + "format": "zarr", + "path": ".", + "crs": crs, + "proj4": _crs_to_proj4(crs), + "bounds": _read_zarr_bounds(store_attrs), + "entries": entries, + } + + +def _icechunk_store_info(dataset_id: str, artifact: ArtifactRecord) -> dict[str, object]: + """Return a Zarr store listing for an Icechunk-backed artifact.""" + import zarr + + from climate_api.ingest.store import open_or_create_repo + + store_path = Path(artifact.asset_paths[0]) + if not store_path.exists(): + raise HTTPException(status_code=404, detail="Icechunk store not found on disk") + + repo = open_or_create_repo(store_path) + session = repo.readonly_session("main") + + root: zarr.Group = zarr.open_group(session.store, mode="r") + store_attrs: dict[str, object] = dict(root.attrs) + + store_crs = store_attrs.get("proj:code") + crs = store_crs if isinstance(store_crs, str) and store_crs else api_config.get_crs() + entries = [ + { + "name": name, + "kind": "directory", + "href": f"/zarr/{dataset_id}/{name}", + } + for name in sorted(root.keys()) + ] + + return { + "kind": "ZarrListing", + "dataset_id": 
dataset_id, + "format": "zarr", "path": ".", "crs": crs, "proj4": _crs_to_proj4(crs), @@ -517,6 +722,10 @@ def get_dataset_zarr_store_file_or_404( ) -> FileResponse | Response | dict[str, object]: """Serve a file, metadata document, or directory listing within a dataset Zarr store.""" artifact = get_latest_artifact_for_dataset_or_404(dataset_id) + + if artifact.format == ArtifactFormat.ICECHUNK: + return _serve_icechunk_key(dataset_id, artifact, relative_path) + store_root = _get_zarr_root_or_409(artifact) target = _resolve_zarr_path(store_root, relative_path) if not target.exists(): @@ -532,10 +741,86 @@ def get_dataset_zarr_store_file_or_404( return FileResponse(target, media_type=media_type, filename=target.name) +def _serve_icechunk_key(dataset_id: str, artifact: ArtifactRecord, relative_path: str) -> Response | dict[str, object]: + """Serve a zarr v3 key from an Icechunk store via its session store.""" + import zarr + + from climate_api.ingest.store import open_or_create_repo + + store_path = Path(artifact.asset_paths[0]) + if not store_path.exists(): + raise HTTPException(status_code=404, detail="Icechunk store not found on disk") + + repo = open_or_create_repo(store_path) + session = repo.readonly_session("main") + key = relative_path.lstrip("/") + + # Directory-like paths: list child keys as a ZarrListing + if not key or key.endswith("/"): + root: zarr.Group = zarr.open_group(session.store, mode="r") + prefix = key.rstrip("/") + try: + node: zarr.Group = root[prefix] if prefix else root # type: ignore[assignment] + except KeyError: + raise HTTPException(status_code=404, detail=f"Zarr path '{relative_path}' not found") + entries = [ + { + "name": name, + "kind": "directory", + "href": f"/zarr/{dataset_id}/{prefix}/{name}".replace("//", "/"), + } + for name in sorted(node.keys()) + ] + return { + "kind": "ZarrListing", + "dataset_id": dataset_id, + "path": key or ".", + "entries": entries, + } + + # Detect bare group-path requests (e.g. "0", "0/precip"). 
+ # fsspec HTTP _ls_real issues GET without a trailing slash; zarr chunk/metadata + # keys always contain a "." (zarr.json) or "/c/" (chunk coordinates), so anything + # that matches neither pattern must be a group path. Return an HTML directory + # listing so fsspec can parse the children via links. + last_segment = key.rsplit("/", 1)[-1] + is_chunk_key = "/c/" in key or key.startswith("c/") + is_file_key = "." in last_segment + if not is_chunk_key and not is_file_key: + root = zarr.open_group(session.store, mode="r") + try: + node = root[key] # type: ignore[assignment] + except KeyError: + raise HTTPException(status_code=404, detail=f"Zarr path '{relative_path}' not found") + children = sorted(node.keys()) + html_lines = [""] + for child in children: + html_lines.append(f'{child}/') + html_lines.append("") + return HTMLResponse("\n".join(html_lines)) + + try: + import zarr.core.buffer + + proto = zarr.core.buffer.default_buffer_prototype() + # IcechunkStore does not expose a public synchronous read method; _get_bytes_sync + # is the internal synchronous accessor used by zarr's own blocking read path. 
+ data = session.store._get_bytes_sync(key, prototype=proto) + if data is None: # pyright: ignore[reportUnnecessaryComparison] + raise HTTPException(status_code=404, detail=f"Zarr key '{relative_path}' not found in store") + except (KeyError, FileNotFoundError): + raise HTTPException(status_code=404, detail=f"Zarr key '{relative_path}' not found in store") + + raw = bytes(data) + if key.endswith("zarr.json"): + return JSONResponse(content=json.loads(raw)) + return Response(content=raw, media_type="application/octet-stream") + + def _load_records() -> list[ArtifactRecord]: ensure_store() raw = json.loads(ARTIFACTS_INDEX_PATH.read_text(encoding="utf-8")) - return [ArtifactRecord.model_validate(_upgrade_legacy_record(item)) for item in raw] + return [ArtifactRecord.model_validate(item) for item in raw] def _save_records(records: list[ArtifactRecord]) -> None: @@ -547,7 +832,6 @@ def _save_records(records: list[ArtifactRecord]) -> None: def _store_artifact_record( record: ArtifactRecord, *, - prefer_zarr: bool, publish: bool, ) -> ArtifactRecord: """Persist a newly created artifact record while avoiding lost updates.""" @@ -557,7 +841,6 @@ def mutate(records: list[ArtifactRecord]) -> ArtifactRecord: records=records, dataset_id=record.dataset_id, request_scope=record.request_scope, - prefer_zarr=prefer_zarr, ) if existing is not None: if publish and existing.publication.status != PublicationStatus.PUBLISHED: @@ -570,23 +853,46 @@ def mutate(records: list[ArtifactRecord]) -> ArtifactRecord: return _mutate_records(mutate) +def _upsert_icechunk_artifact_record(record: ArtifactRecord) -> ArtifactRecord: + """Persist an Icechunk artifact record, replacing any existing record for the same store path. + + Matches by dataset_id + path rather than request_scope so that sync appends + (which extend the end date) update the existing record in-place instead of + accumulating duplicate entries for the same physical store. 
+ """ + + def mutate(records: list[ArtifactRecord]) -> ArtifactRecord: + for i, existing in enumerate(records): + if existing.dataset_id == record.dataset_id and existing.asset_paths == record.asset_paths: + replacement = record.model_copy( + update={ + "artifact_id": existing.artifact_id, + "publication": existing.publication, + } + ) + records[i] = replacement + return replacement + records.append(record) + return record + + return _mutate_records(mutate) + + def _upsert_artifact_record( record: ArtifactRecord, *, - prefer_zarr: bool, publish: bool, overwrite: bool, ) -> ArtifactRecord: """Persist a new or replacement artifact record for the same logical request scope.""" if not overwrite: - return _store_artifact_record(record, prefer_zarr=prefer_zarr, publish=publish) + return _store_artifact_record(record, publish=publish) def mutate(records: list[ArtifactRecord]) -> ArtifactRecord: existing = _find_existing_artifact_in_records( records=records, dataset_id=record.dataset_id, request_scope=record.request_scope, - prefer_zarr=prefer_zarr, ) if existing is None: records.append(record) @@ -615,7 +921,7 @@ def _mutate_records(mutation: Callable[[list[ArtifactRecord]], ArtifactRecord]) portalocker.lock(handle, portalocker.LOCK_EX) handle.seek(0) raw = handle.read() - records = [ArtifactRecord.model_validate(_upgrade_legacy_record(item)) for item in json.loads(raw or "[]")] + records = [ArtifactRecord.model_validate(item) for item in json.loads(raw or "[]")] result = mutation(records) payload = [record.model_dump(mode="json") for record in records] handle.seek(0) @@ -632,7 +938,7 @@ def _get_zarr_root_or_409(artifact: ArtifactRecord) -> Path: if artifact.format != ArtifactFormat.ZARR: raise HTTPException(status_code=409, detail="Artifact is not a Zarr store") - store_root = Path(artifact.path or artifact.asset_paths[0]).resolve() + store_root = Path(artifact.asset_paths[0]).resolve() if not store_root.exists() or not store_root.is_dir(): raise 
HTTPException(status_code=404, detail="Zarr store path does not exist on disk") return store_root @@ -676,21 +982,19 @@ def _find_existing_artifact( *, dataset_id: str, request_scope: ArtifactRequestScope, - prefer_zarr: bool, ) -> ArtifactRecord | None: """Return an existing artifact for an identical logical request when possible.""" return _find_existing_artifact_in_records( records=_load_records(), dataset_id=dataset_id, request_scope=request_scope, - prefer_zarr=prefer_zarr, ) -def _normalize_request_period(value: str, *, period_type: str, field_name: str) -> str: +def _normalize_request_period(value: str, *, period_type: str, field_name: str, is_end: bool = False) -> str: """Normalize a required request period or raise a clear client error.""" try: - return normalize_period_string(value, period_type) + return normalize_period_string(value, period_type, is_end=is_end) except (TypeError, ValueError) as exc: raise HTTPException( status_code=400, @@ -698,18 +1002,20 @@ def _normalize_request_period(value: str, *, period_type: str, field_name: str) ) from exc -def _normalize_optional_request_period(value: str | None, *, period_type: str, field_name: str) -> str | None: +def _normalize_optional_request_period( + value: str | None, *, period_type: str, field_name: str, is_end: bool = False +) -> str | None: """Normalize an optional request period or raise a clear client error.""" if value is None: return None - return _normalize_request_period(value, period_type=period_type, field_name=field_name) + return _normalize_request_period(value, period_type=period_type, field_name=field_name, is_end=is_end) def _default_request_end(period_type: str) -> str: """Return the current dataset-native period string for omitted ingestion end values.""" if period_type == "hourly": return datetime_to_period_string(utc_now(), period_type) - if period_type == "daily": + if period_type in ("daily", "dekadal"): return utc_today().isoformat() if period_type == "weekly": return 
datetime_to_period_string(utc_now(), period_type) @@ -749,7 +1055,6 @@ def _find_existing_artifact_in_records( records: list[ArtifactRecord], dataset_id: str, request_scope: ArtifactRequestScope, - prefer_zarr: bool, ) -> ArtifactRecord | None: """Return an existing artifact for an identical logical request from a provided record set.""" for record in reversed(records): @@ -773,8 +1078,6 @@ def _find_existing_artifact_in_records( record.request_scope.end, ) continue - if prefer_zarr and record.format != ArtifactFormat.ZARR: - continue return record return None @@ -797,14 +1100,9 @@ def _materialized_records(records: list[ArtifactRecord]) -> list[ArtifactRecord] def _artifact_storage_exists(record: ArtifactRecord) -> bool: """Return whether an artifact's on-disk backing files are still present.""" - paths: list[str] = [] - if record.path is not None: - paths.append(record.path) - if record.asset_paths: - paths.extend(record.asset_paths) - if not paths: + if not record.asset_paths: return False - return all(Path(path).exists() for path in paths) + return all(Path(path).exists() for path in record.asset_paths) def _temporal_coverage_matches_request_scope( @@ -876,12 +1174,12 @@ def _dataset_links(dataset_id: str, latest: ArtifactRecord) -> list[DatasetAcces DatasetAccessLink(href=f"/datasets/{dataset_id}", rel="self", title="Dataset detail"), DatasetAccessLink(href=f"/zarr/{dataset_id}", rel="zarr", title="Zarr store"), ] - if latest.publication.status == PublicationStatus.PUBLISHED and latest.format == ArtifactFormat.ZARR: + is_published_store = latest.publication.status == PublicationStatus.PUBLISHED and latest.format in { + ArtifactFormat.ZARR, + ArtifactFormat.ICECHUNK, + } + if is_published_store: links.append(DatasetAccessLink(href=f"/stac/collections/{dataset_id}", rel="stac", title="STAC collection")) - if latest.format == ArtifactFormat.NETCDF: - links.append( - DatasetAccessLink(href=f"/datasets/{dataset_id}/download", rel="download", title="Download 
NetCDF") - ) if latest.publication.pygeoapi_path is not None: links.append( DatasetAccessLink(href=latest.publication.pygeoapi_path, rel="ogc-collection", title="OGC collection") @@ -891,42 +1189,3 @@ def _dataset_links(dataset_id: str, latest: ArtifactRecord) -> list[DatasetAcces def _as_optional_str(value: object) -> str | None: return value if isinstance(value, str) else None - - -def _upgrade_legacy_record(item: dict[str, object]) -> dict[str, object]: - """Backfill newer schema fields for records created before migrations existed.""" - if "request_scope" not in item: - coverage = item.get("coverage") - if isinstance(coverage, dict): - spatial = coverage.get("spatial") - temporal = coverage.get("temporal") - bbox: tuple[float, float, float, float] | None = None - if isinstance(spatial, dict): - xmin = spatial.get("xmin") - ymin = spatial.get("ymin") - xmax = spatial.get("xmax") - ymax = spatial.get("ymax") - if ( - isinstance(xmin, int | float) - and isinstance(ymin, int | float) - and isinstance(xmax, int | float) - and isinstance(ymax, int | float) - ): - bbox = (float(xmin), float(ymin), float(xmax), float(ymax)) - - start = "" - end: str | None = None - if isinstance(temporal, dict): - raw_start = temporal.get("start") - raw_end = temporal.get("end") - if isinstance(raw_start, str): - start = raw_start - if isinstance(raw_end, str): - end = raw_end - - item["request_scope"] = { - "start": start, - "end": end, - "bbox": bbox, - } - return item diff --git a/climate_api/ingestions/sync_engine.py b/climate_api/ingestions/sync_engine.py index 1685065f..451eb6f4 100644 --- a/climate_api/ingestions/sync_engine.py +++ b/climate_api/ingestions/sync_engine.py @@ -11,15 +11,12 @@ from __future__ import annotations -import importlib -import inspect import logging from collections.abc import Callable from datetime import date, datetime, time, timedelta from typing import Any from climate_api.ingestions.schemas import ArtifactRecord, SyncAction, SyncDetail, SyncKind, 
SyncResponse -from climate_api.providers import availability as provider_availability from climate_api.publications.services import managed_dataset_id_for from climate_api.shared.time import ( datetime_to_period_string, @@ -42,6 +39,7 @@ def plan_sync( source_dataset: dict[str, Any], latest_artifact: ArtifactRecord, requested_end: str | None, + current_end: str | None = None, ) -> SyncDetail: """Return the sync decision for one managed dataset without changing local state. @@ -54,6 +52,10 @@ def plan_sync( - release datasets compare the current materialized release against the requested end - static datasets are marked as not syncable + `current_end` overrides `latest_artifact.coverage.temporal.end` when provided. + Callers pass the store-authoritative value for formats (e.g. Icechunk) where the + artifact metadata record may lag behind what is actually committed on disk. + This planner deliberately does not download data or persist artifacts. """ sync_kind_value = source_dataset.get("sync", {}).get("kind") @@ -61,7 +63,7 @@ def plan_sync( raise ValueError("source_dataset must define sync.kind for sync planning") sync_kind = SyncKind(sync_kind_value) current_start = latest_artifact.request_scope.start - current_end = latest_artifact.coverage.temporal.end + current_end = current_end if current_end is not None else latest_artifact.coverage.temporal.end if sync_kind == SyncKind.STATIC: return SyncDetail( @@ -75,6 +77,8 @@ def plan_sync( target_end=current_end, target_end_source="current_coverage", ) + if current_end is None: + raise ValueError(f"Cannot plan sync for {sync_kind.value} dataset with no existing temporal coverage") period_type = str(source_dataset["period_type"]) normalized_requested_end = requested_end.strip() if isinstance(requested_end, str) else None normalized_requested_end = normalized_requested_end or None @@ -83,7 +87,9 @@ def plan_sync( resolved_end = normalize_period_string(normalized_requested_end, period_type) else: resolved_end = 
_default_target_end(period_type=period_type) - latest_available_end = _latest_available_end(source_dataset=source_dataset, requested_end=resolved_end) + latest_available_end = _latest_available_end( + source_dataset=source_dataset, requested_end=resolved_end, current_end=current_end + ) target_end_source = ( requested_target_end_source if latest_available_end == resolved_end @@ -163,11 +169,11 @@ def run_sync( latest_artifact: ArtifactRecord, source_dataset: dict[str, Any], requested_end: str | None, - country_code: str | None, - prefer_zarr: bool, publish: bool, create_artifact_fn: Callable[..., ArtifactRecord], get_dataset_fn: Callable[[str], Any], + current_end: str | None = None, + on_progress: Any | None = None, ) -> SyncResponse: """Plan and execute one sync operation for a managed dataset. @@ -183,6 +189,7 @@ def run_sync( source_dataset=source_dataset, latest_artifact=latest_artifact, requested_end=requested_end, + current_end=current_end, ) dataset_id = managed_dataset_id_for(latest_artifact) logger.info( @@ -242,10 +249,9 @@ def run_sync( download_start=download_start, download_end=sync_detail.delta_end if download_start is not None else None, bbox=list(latest_artifact.request_scope.bbox) if latest_artifact.request_scope.bbox is not None else None, - country_code=country_code, overwrite=False, - prefer_zarr=prefer_zarr, publish=publish, + on_progress=on_progress, ) logger.info( "Sync completed for dataset '%s': artifact_id=%s action=%s", @@ -295,7 +301,7 @@ def _next_period_start(latest_period_end: str, *, period_type: str) -> str: if period_type == "hourly": timestamp = parse_hourly_period_string(latest_period_end) return datetime_to_period_string(timestamp + timedelta(hours=1), period_type) - if period_type == "daily": + if period_type in ("daily", "dekadal"): current = date.fromisoformat(latest_period_end) return (current + timedelta(days=1)).isoformat() if period_type == "weekly": @@ -318,7 +324,7 @@ def _default_target_end(*, period_type: str) -> 
str: today = utc_today() if period_type == "hourly": return datetime_to_period_string(utc_now(), period_type) - if period_type == "daily": + if period_type in ("daily", "dekadal"): return today.isoformat() if period_type == "weekly": return datetime_to_period_string(utc_now(), period_type) @@ -329,93 +335,127 @@ def _default_target_end(*, period_type: str) -> str: raise ValueError(f"Unsupported period_type '{period_type}' for sync") -def _latest_available_end(*, source_dataset: dict[str, Any], requested_end: str) -> str: - """Clamp requested sync end to the latest upstream state declared by template metadata. +def _latest_available_end(*, source_dataset: dict[str, Any], requested_end: str, current_end: str | None = None) -> str: + """Clamp requested sync end to the latest upstream state via the plugin's periods() method. - The current engine does not query upstream providers directly. Instead it can - apply conservative template metadata so sync planning does not overshoot known - provider lag or release cadence. + current_end must be provided so the function can return it when periods() reports nothing + new (empty list → NOOP detected by caller). """ - availability = source_dataset.get("sync", {}).get("availability") - if not isinstance(availability, dict): - return requested_end - - provider_latest = _provider_latest_available_end( - source_dataset=source_dataset, - availability=availability, - requested_end=requested_end, - ) - if provider_latest is not None: - return min(requested_end, provider_latest) - # Keep the legacy metadata-only lag fallback for templates that do not yet - # declare a latest_available_function, but delegate to the provider helper - # so lag logic lives in one place. 
- return min( - requested_end, - provider_availability.lagged_latest_available( - dataset=source_dataset, - requested_end=requested_end, - ), - ) - - -def _supports_append(source_dataset: dict[str, Any], latest_artifact: ArtifactRecord) -> bool: - """Return whether this template opts into V1 delta-download sync execution.""" - from pathlib import Path + period_type = source_dataset.get("period_type") + if current_end is not None and isinstance(period_type, str): + ingestion = source_dataset.get("ingestion") + if isinstance(ingestion, dict) and isinstance(ingestion.get("plugin"), str): + next_start = _next_period_start(current_end, period_type=period_type) + plugin_latest = _plugin_latest_available_period( + source_dataset=source_dataset, + next_period_start=next_start, + requested_end=requested_end, + current_end=current_end, + ) + if plugin_latest is not None: + return min(requested_end, plugin_latest) - if source_dataset.get("sync", {}).get("execution") != SyncAction.APPEND.value: - return False - # Pyramid zarr stores cannot be appended to — they must be rebuilt in full. - # Detect this from the existing artifact's on-disk structure rather than YAML. 
- artifact_path = latest_artifact.path - if artifact_path and "://" not in artifact_path and (Path(artifact_path) / "0").is_dir(): - logger.warning( - "Sync append execution is not supported for pyramid zarr dataset '%s'; falling back to rematerialize", - source_dataset.get("id", ""), - ) - return False - return True + return requested_end -def _provider_latest_available_end( +def _plugin_latest_available_period( *, source_dataset: dict[str, Any], - availability: dict[str, Any], + next_period_start: str, requested_end: str, + current_end: str, ) -> str | None: - """Call an optional provider-specific latest-availability function.""" - function_path = availability.get("latest_available_function") - if not isinstance(function_path, str) or not function_path: + """Return the last period available from next_period_start..requested_end via the plugin. + + Returns: + - str: the last available period in the range (may equal current_end when nothing new) + - None: plugin could not be instantiated (caller falls back to legacy availability logic) + + """ + ingestion = source_dataset.get("ingestion") + if not isinstance(ingestion, dict): + return None + plugin_path = ingestion.get("plugin") + if not isinstance(plugin_path, str): return None + _raw_params = ingestion.get("params") + params: dict[str, Any] = dict(_raw_params) if isinstance(_raw_params, dict) else {} + try: - latest_available_fn = _get_dynamic_function(function_path) - params: dict[str, Any] = {} - signature = inspect.signature(latest_available_fn) - if "dataset" in signature.parameters: - params["dataset"] = source_dataset - if "requested_end" in signature.parameters: - params["requested_end"] = requested_end - result = latest_available_fn(**params) - except (AttributeError, ImportError, TypeError, ValueError) as exc: - raise SyncConfigurationError(f"Latest availability function '{function_path}' failed: {exc}") from exc - if not isinstance(result, str): - raise SyncConfigurationError(f"Latest availability 
function '{function_path}' must return a period string") + from climate_api.ingest.orchestrator import load_plugin + + plugin = load_plugin(plugin_path, params) + except (TypeError, ValueError, ImportError, AttributeError) as exc: + logger.debug( + "Plugin '%s' cannot be instantiated for availability check (needs extra_params?): %s", + plugin_path, + exc, + ) + return None + try: - return normalize_period_string(result, period_type=str(source_dataset["period_type"])) - except (KeyError, TypeError, ValueError) as exc: - raise SyncConfigurationError( - f"Latest availability function '{function_path}' returned invalid period " - f"'{result}' for dataset period_type '{source_dataset.get('period_type')}'" - ) from exc - - -def _get_dynamic_function(full_path: str) -> Callable[..., Any]: - """Import and return a function given its dotted module path.""" - parts = full_path.split(".") - if len(parts) < 2 or any(not part for part in parts): - raise ValueError(f"Invalid dotted function path '{full_path}'") - module_path = ".".join(parts[:-1]) - function_name = parts[-1] - module = importlib.import_module(module_path) - return getattr(module, function_name) # type: ignore[no-any-return] + periods: list[str] = plugin.periods(next_period_start, requested_end) + except Exception as exc: + logger.debug("plugin.periods() failed during availability check for '%s': %s", plugin_path, exc) + return None + + return periods[-1] if periods else current_end + + +def _supports_append(source_dataset: dict[str, Any], latest_artifact: ArtifactRecord) -> bool: + """Return whether this artifact supports incremental append sync execution. + + Icechunk stores always support append: the orchestrator uses read_committed_period_ids + to determine exactly which periods are missing and commits only those. No YAML + sync.execution flag is required. 
+ + For all other formats the YAML must opt in with sync.execution: append, and + pyramid zarr stores (identified by a "0/" subdirectory) are excluded because + they must be rebuilt in full. + """ + from climate_api.ingestions.schemas import ArtifactFormat + + if latest_artifact.format == ArtifactFormat.ICECHUNK: + # Pyramid Icechunk stores have data under group "0"; appending to root + # would create a second flat dataset instead of extending the pyramid. + # Fall back to rematerialize so the full pyramid is rebuilt. + if latest_artifact.asset_paths: + from pathlib import Path + + from climate_api.ingest.store import open_or_create_repo + + try: + import zarr + + icechunk_path = Path(latest_artifact.asset_paths[0]) + if icechunk_path.exists(): + repo = open_or_create_repo(icechunk_path) + session = repo.readonly_session("main") + root = zarr.open_group(session.store, mode="r") + if "multiscales" in root.attrs: + logger.warning( + "Sync append not supported for pyramid Icechunk dataset '%s'; " + "falling back to rematerialize", + source_dataset.get("id", ""), + ) + return False + except Exception: + pass # store unreadable — let the ingest path handle it + return True + + if source_dataset.get("sync", {}).get("execution") != SyncAction.APPEND.value: + return False + # Pyramid zarr stores cannot be appended to — they must be rebuilt in full. + # Detect this from the existing artifact's on-disk structure rather than YAML. 
+ if latest_artifact.asset_paths: + from pathlib import Path + + artifact_path = latest_artifact.asset_paths[0] + if "://" not in artifact_path and (Path(artifact_path) / "0").is_dir(): + logger.warning( + "Sync append execution is not supported for pyramid zarr dataset '%s'; falling back to rematerialize", + source_dataset.get("id", ""), + ) + return False + return True diff --git a/climate_api/processing/resample.py b/climate_api/processing/resample.py index 62970b1e..bbca9467 100644 --- a/climate_api/processing/resample.py +++ b/climate_api/processing/resample.py @@ -15,7 +15,7 @@ import xarray as xr from fastapi import HTTPException -from climate_api.data_accessor.services.accessor import open_zarr_dataset +from climate_api.data_accessor.services.accessor import open_icechunk_dataset, open_zarr_dataset from climate_api.data_manager.services.utils import get_time_dim from climate_api.data_registry.services import datasets as registry_datasets from climate_api.ingestions import services as ingestion_services @@ -76,7 +76,6 @@ def materialize_resampled_artifact( existing = ingestion_services._find_existing_artifact( dataset_id=target_dataset_id, request_scope=ArtifactRequestScope(start=start, end=resolved_end), - prefer_zarr=True, ) if existing is not None and not overwrite: if publish and existing.publication.status != PublicationStatus.PUBLISHED: @@ -88,13 +87,17 @@ def materialize_resampled_artifact( raise HTTPException(status_code=404, detail=f"Source dataset template '{source_dataset_id}' not found") source_artifact = _resolve_source_artifact(source_dataset_id=source_dataset_id) - if source_artifact.format != ArtifactFormat.ZARR: - raise HTTPException(status_code=409, detail="Resampling currently requires a Zarr-backed source artifact") + if source_artifact.format not in {ArtifactFormat.ZARR, ArtifactFormat.ICECHUNK}: + raise HTTPException(status_code=409, detail="Resampling currently requires a Zarr or Icechunk source artifact") target_managed_dataset_id = 
managed_dataset_id_for_scope(target_dataset_id) zarr_path = DERIVED_DATA_DIR / f"{target_managed_dataset_id}.zarr" - source_ds = open_zarr_dataset(source_artifact.path or source_artifact.asset_paths[0]) + source_path = source_artifact.asset_paths[0] + if source_artifact.format == ArtifactFormat.ICECHUNK: + source_ds = open_icechunk_dataset(source_path) + else: + source_ds = open_zarr_dataset(source_path) try: resampled = _resample_dataset( source_ds=source_ds, @@ -234,7 +237,6 @@ def _find_existing_resampled_artifact( return ingestion_services._find_existing_artifact( dataset_id=target_dataset_id, request_scope=ArtifactRequestScope(start=start, end=realized_end), - prefer_zarr=True, ) diff --git a/climate_api/providers/__init__.py b/climate_api/providers/__init__.py deleted file mode 100644 index 179fb70c..00000000 --- a/climate_api/providers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Provider-specific Climate API helpers.""" diff --git a/climate_api/providers/availability.py b/climate_api/providers/availability.py deleted file mode 100644 index 1969d767..00000000 --- a/climate_api/providers/availability.py +++ /dev/null @@ -1,88 +0,0 @@ -"""Provider availability policies used by sync planning. - -These functions keep source-specific release cadence rules out of the generic -sync engine. They are intentionally small and metadata-driven so dataset YAML can -choose the right policy per upstream provider. -""" - -from __future__ import annotations - -from calendar import monthrange -from datetime import date, timedelta -from typing import Any - -from climate_api.shared.time import datetime_to_period_string, utc_now, utc_today - - -def chirps3_daily_latest_available(*, dataset: dict[str, Any], requested_end: str) -> str: - """Return latest complete CHIRPS3 daily period available for safe sync. - - The dhis2eo CHIRPS3 downloader groups daily files by source month. 
For - final/rnl data, use only fully released months by default: after the 20th, - the previous month is considered available; otherwise the month before that - is the latest safe complete month. - """ - availability = _availability_metadata(dataset) - threshold_day = availability.get("complete_month_after_day", 20) - if not isinstance(threshold_day, int): - threshold_day = 20 - - today = utc_today() - months_back = 1 if today.day > threshold_day else 2 - available_month = _add_months(today.replace(day=1), -months_back) - latest_day = monthrange(available_month.year, available_month.month)[1] - return date(available_month.year, available_month.month, latest_day).isoformat() - - -def lagged_latest_available(*, dataset: dict[str, Any], requested_end: str) -> str: - """Return latest available period by applying YAML-declared lag metadata.""" - availability = _availability_metadata(dataset) - period_type = str(dataset.get("period_type", "daily")) - - if period_type == "hourly": - lag_hours = availability.get("lag_hours") - if isinstance(lag_hours, int) and lag_hours > 0: - latest = utc_now() - timedelta(hours=lag_hours) - return datetime_to_period_string(latest, period_type) - return requested_end - - lag_days = availability.get("lag_days") - if period_type in {"daily", "monthly"} and isinstance(lag_days, int) and lag_days > 0: - latest_date = utc_today() - timedelta(days=lag_days) - if period_type == "monthly": - return f"{latest_date.year:04d}-{latest_date.month:02d}" - return latest_date.isoformat() - - if period_type == "yearly": - latest_year_offset = availability.get("latest_year_offset") - if isinstance(latest_year_offset, int) and latest_year_offset >= 0: - return str(utc_today().year - latest_year_offset) - - return requested_end - - -def worldpop_release_latest_available(*, dataset: dict[str, Any], requested_end: str) -> str: - """Return WorldPop release availability, including configured projections.""" - availability = _availability_metadata(dataset) - if 
availability.get("allow_future") is True: - return requested_end - - latest_year = availability.get("latest_year") - if isinstance(latest_year, int): - return str(latest_year) - - return lagged_latest_available(dataset=dataset, requested_end=requested_end) - - -def _availability_metadata(dataset: dict[str, Any]) -> dict[str, Any]: - """Return sync availability metadata from a dataset template.""" - availability = dataset.get("sync", {}).get("availability") - return availability if isinstance(availability, dict) else {} - - -def _add_months(value: date, offset: int) -> date: - """Add a month offset to the first day of a month.""" - month_index = value.year * 12 + value.month - 1 + offset - year = month_index // 12 - month = month_index % 12 + 1 - return date(year, month, 1) diff --git a/climate_api/publications/services.py b/climate_api/publications/services.py index 6451e177..e7444af2 100644 --- a/climate_api/publications/services.py +++ b/climate_api/publications/services.py @@ -12,7 +12,7 @@ import xarray as xr import yaml -from climate_api.data_accessor.services.accessor import open_zarr_dataset +from climate_api.data_accessor.services.accessor import open_icechunk_dataset, open_zarr_dataset from climate_api.data_manager.services.utils import get_time_dim, get_x_y_dims from climate_api.ingestions.schemas import ArtifactFormat, ArtifactRecord, PublicationStatus @@ -47,8 +47,9 @@ def publish_artifact(record: ArtifactRecord) -> ArtifactRecord: from climate_api.ingestions.services import list_artifacts collection_id = managed_dataset_id_for(record) - data_path = record.path or record.asset_paths[0] + data_path = record.asset_paths[0] is_pyramid_zarr = record.format == ArtifactFormat.ZARR and (Path(data_path) / "0").is_dir() + is_icechunk = record.format == ArtifactFormat.ICECHUNK published_record = record.model_copy( update={ "publication": record.publication.model_copy( @@ -56,8 +57,10 @@ def publish_artifact(record: ArtifactRecord) -> ArtifactRecord: "status": 
PublicationStatus.PUBLISHED, "collection_id": collection_id, "published_at": datetime.now(UTC), - # Pyramid zarr stores are served via the /zarr endpoint, not pygeoapi. - "pygeoapi_path": None if is_pyramid_zarr else f"/ogcapi/collections/{collection_id}", + # Pyramid zarr and Icechunk stores are served via the /zarr endpoint, not pygeoapi. + "pygeoapi_path": None + if (is_pyramid_zarr or is_icechunk) + else f"/ogcapi/collections/{collection_id}", } ) } @@ -68,7 +71,9 @@ def publish_artifact(record: ArtifactRecord) -> ArtifactRecord: active = published_record if artifact.artifact_id == record.artifact_id else artifact if active.publication.status != PublicationStatus.PUBLISHED: continue - data_path = active.path or active.asset_paths[0] + data_path = active.asset_paths[0] + if active.format == ArtifactFormat.ICECHUNK: + continue # icechunk: not served via pygeoapi, use /zarr endpoint instead if active.format == ArtifactFormat.ZARR and (Path(data_path) / "0").is_dir(): continue # pyramid zarr: not served via pygeoapi, use /zarr endpoint instead assert active.publication.collection_id is not None @@ -115,7 +120,7 @@ def _build_collection_resource(record: ArtifactRecord) -> dict[str, Any]: provider: dict[str, Any] = { "type": "coverage", "name": "xarray", - "data": record.path or record.asset_paths[0], + "data": record.asset_paths[0], "x_field": x_field, "y_field": y_field, "time_field": time_field, @@ -159,8 +164,10 @@ def _provider_format(artifact_format: ArtifactFormat) -> dict[str, str]: def _provider_axes(record: ArtifactRecord) -> tuple[str, str, str]: """Inspect an artifact and return provider axis field names.""" - data_path = record.path or record.asset_paths[0] - if record.format == ArtifactFormat.ZARR: + data_path = record.asset_paths[0] + if record.format == ArtifactFormat.ICECHUNK: + ds = open_icechunk_dataset(data_path) + elif record.format == ArtifactFormat.ZARR: ds = open_zarr_dataset(data_path) else: ds = xr.open_dataset(data_path) diff --git 
a/climate_api/shared/time.py b/climate_api/shared/time.py index 922f65fe..fdd05965 100644 --- a/climate_api/shared/time.py +++ b/climate_api/shared/time.py @@ -93,7 +93,7 @@ def datetime_to_period_string(value: datetime, period_type: str) -> str: value = _normalize_datetime_for_period(value) if period_type == "hourly": return value.replace(minute=0, second=0, microsecond=0).strftime("%Y-%m-%dT%H") - if period_type == "daily": + if period_type in ("daily", "dekadal"): return value.date().isoformat() if period_type == "weekly": iso_year, iso_week, _ = value.isocalendar() @@ -132,18 +132,27 @@ def parse_weekly_period_string(value: str) -> datetime: return datetime.fromisoformat(value) -def normalize_period_string(value: str, period_type: str) -> str: - """Normalize an input period string to the dataset-native period format.""" +def normalize_period_string(value: str, period_type: str, *, is_end: bool = False) -> str: + """Normalize an input period string to the dataset-native period format. + + When is_end=True and period_type='hourly', a date-only input (YYYY-MM-DD) + is treated as the last hour of that day (T23) rather than T00. + """ if period_type == "hourly": try: - return datetime_to_period_string(parse_hourly_period_string(value), period_type) + dt = parse_hourly_period_string(value) + # A bare date with no time component defaults to midnight; for an end + # bound that means the user intended the last hour of the day. 
+ if is_end and len(value) == 10: + dt = dt.replace(hour=23) + return datetime_to_period_string(dt, period_type) except ValueError as exc: raise ValueError(f"Invalid hourly period '{value}'; expected YYYY-MM-DDTHH or ISO datetime") from exc - if period_type == "daily": + if period_type in ("daily", "dekadal"): try: return datetime_to_period_string(datetime.fromisoformat(value), period_type) except ValueError as exc: - raise ValueError(f"Invalid daily period '{value}'; expected YYYY-MM-DD or ISO datetime") from exc + raise ValueError(f"Invalid {period_type} period '{value}'; expected YYYY-MM-DD or ISO datetime") from exc if period_type == "weekly": try: return datetime_to_period_string(parse_weekly_period_string(value), period_type) @@ -190,7 +199,7 @@ def parse_period_string_to_datetime(value: str) -> datetime: def numpy_datetime_to_period_string(datetimes: np.ndarray[Any, Any], period_type: str) -> np.ndarray[Any, Any]: """Convert an array of numpy datetimes to truncated period strings.""" if period_type != "weekly": - lengths = {"hourly": 13, "daily": 10, "monthly": 7, "yearly": 4} + lengths = {"hourly": 13, "daily": 10, "dekadal": 10, "monthly": 7, "yearly": 4} return np.datetime_as_string(datetimes, unit="s").astype(f"U{lengths[period_type]}") dt_index = pd.DatetimeIndex(np.atleast_1d(np.asarray(datetimes, dtype="datetime64[ns]"))) diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index c8801196..fa00abf9 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -13,7 +13,7 @@ from fastapi import HTTPException, Request from xstac import xarray_to_stac -from climate_api.data_accessor.services.accessor import open_zarr_dataset +from climate_api.data_accessor.services.accessor import open_icechunk_dataset, open_zarr_dataset from climate_api.data_manager.services.utils import get_time_dim, get_x_y_dims from climate_api.data_registry.services import datasets as registry_datasets from climate_api.ingestions import services 
as ingestion_services @@ -131,7 +131,7 @@ def _eligible_artifacts_by_dataset() -> dict[str, ArtifactRecord]: latest = max(artifacts, key=lambda artifact: artifact.created_at) if latest.publication.status != PublicationStatus.PUBLISHED: continue - if latest.format != ArtifactFormat.ZARR: + if latest.format not in (ArtifactFormat.ZARR, ArtifactFormat.ICECHUNK): continue result[dataset_id] = latest return dict(sorted(result.items())) @@ -155,7 +155,12 @@ def _build_collection_template( extent=pystac.Extent( spatial=pystac.SpatialExtent([[spatial.xmin, spatial.ymin, spatial.xmax, spatial.ymax]]), temporal=pystac.TemporalExtent( - [[parse_period_string_to_datetime(temporal.start), parse_period_string_to_datetime(temporal.end)]] + [ + [ + parse_period_string_to_datetime(temporal.start) if temporal.start else None, + parse_period_string_to_datetime(temporal.end) if temporal.end else None, + ] + ] ), ), title=artifact.dataset_name, @@ -196,7 +201,11 @@ def _build_collection_with_xstac(*, artifact: ArtifactRecord, template: pystac.C return deepcopy(cached_payload) try: - ds = open_zarr_dataset(_artifact_store_path(artifact)) + store_path = _artifact_store_path(artifact) + if artifact.format == ArtifactFormat.ICECHUNK: + ds = open_icechunk_dataset(store_path) + else: + ds = open_zarr_dataset(store_path) except HTTPException: raise except Exception as exc: @@ -207,14 +216,42 @@ def _build_collection_with_xstac(*, artifact: ArtifactRecord, template: pystac.C ) from exc try: x_dimension, y_dimension = get_x_y_dims(ds) - time_dimension = get_time_dim(ds) + try: + time_dimension = get_time_dim(ds) + except ValueError: + time_dimension = None + # Detect the actual data CRS so proj:code reflects the store's native coordinate + # system rather than the deployment CRS. This matters when a dataset (e.g. WorldPop) + # is stored in WGS84 while the deployment is configured for a projected CRS. 
+ detected_crs = _detect_dataset_crs(ds) + if detected_crs: + template.extra_fields["proj:code"] = detected_crs + try: + reference_system = int(detected_crs.split(":")[-1]) if detected_crs else 4326 + except ValueError: + reference_system = 4326 + if time_dimension is None: + # xstac requires a temporal dimension; skip it for timeless (static) + # stores and build only spatial cube:dimensions by hand. + payload = template.to_dict(include_self_link=False) + payload["cube:dimensions"] = { + x_dimension: {"type": "spatial", "axis": "x", "reference_system": reference_system}, + y_dimension: {"type": "spatial", "axis": "y", "reference_system": reference_system}, + } + _cache_xstac_collection_payload(artifact.artifact_id, payload) + return deepcopy(payload) + + # xstac crashes on a scalar (0-d) time coordinate when computing + # min/max for the temporal extent. Expand to a 1-element array first. + if hasattr(ds, "coords") and time_dimension in ds.coords and ds[time_dimension].ndim == 0: + ds = ds.expand_dims(time_dimension) result = xarray_to_stac( ds, template, temporal_dimension=time_dimension, x_dimension=x_dimension, y_dimension=y_dimension, - reference_system=4326, + reference_system=reference_system, # Schema validation can trigger outbound fetches for STAC extension schemas. validate=False, ) @@ -222,7 +259,7 @@ def _build_collection_with_xstac(*, artifact: ArtifactRecord, template: pystac.C # clear xstac/pystac-owned links before serialization to avoid root-link # resolution attempts during to_dict(). 
result.clear_links() - payload: dict[str, Any] = result.to_dict(include_self_link=False) + payload = result.to_dict(include_self_link=False) _cache_xstac_collection_payload(artifact.artifact_id, payload) return deepcopy(payload) except HTTPException: @@ -247,10 +284,6 @@ def _cache_xstac_collection_payload(artifact_id: str, payload: dict[str, Any]) - _xstac_collection_cache[artifact_id] = deepcopy(payload) -def _clear_xstac_collection_cache() -> None: - _xstac_collection_cache.clear() - - def _link_to_dict(link: pystac.Link) -> dict[str, Any]: target = link.target href = target if isinstance(target, str) else link.href @@ -282,8 +315,6 @@ def _required_zarr_asset(template: pystac.Collection) -> pystac.Asset: def _artifact_store_path(artifact: ArtifactRecord) -> str: - if artifact.path: - return artifact.path if artifact.asset_paths: return artifact.asset_paths[0] raise HTTPException( @@ -298,9 +329,6 @@ def _public_zarr_asset_href( artifact: ArtifactRecord, source_dataset: dict[str, Any], ) -> str: - artifact_path = _artifact_store_path(artifact) - if _is_pyramid_zarr(artifact_path): - return _abs_url(request, f"/zarr/{dataset_id}/0") return _abs_url(request, f"/zarr/{dataset_id}") @@ -308,7 +336,22 @@ def _is_pyramid_zarr(artifact_path: str) -> bool: """Return True if artifact_path is a multiscale pyramid zarr store.""" if "://" in artifact_path: return False - return (Path(artifact_path) / "0").is_dir() + path = Path(artifact_path) + if (path / "0").is_dir(): + return True + if path.suffix == ".icechunk": + try: + import zarr + + from climate_api.ingest.store import open_or_create_repo + + repo = open_or_create_repo(path) + session = repo.readonly_session("main") + root = zarr.open_group(session.store, mode="r") + return "0" in root + except Exception: + return False + return False def _abs_url(request: Request, path: str) -> str: @@ -349,14 +392,13 @@ def _override_spatial_extent_from_artifact(collection: dict[str, Any], artifact: def 
_override_temporal_extent_from_artifact(collection: dict[str, Any], artifact: ArtifactRecord) -> None: temporal = artifact.coverage.temporal - start = parse_period_string_to_datetime(temporal.start).isoformat().replace("+00:00", "Z") - end = parse_period_string_to_datetime(temporal.end).isoformat().replace("+00:00", "Z") - collection["extent"]["temporal"]["interval"] = [ - [ - start, - end, - ] - ] + + def _fmt(period: str | None) -> str | None: + return parse_period_string_to_datetime(period).isoformat().replace("+00:00", "Z") if period else None + + start = _fmt(temporal.start) + end = _fmt(temporal.end) + collection["extent"]["temporal"]["interval"] = [[start, end]] dimensions = collection.setdefault("cube:dimensions", {}) for key, value in dimensions.items(): if isinstance(value, dict) and value.get("type") == "temporal": @@ -411,6 +453,9 @@ def _keywords(artifact: ArtifactRecord, source_dataset: dict[str, Any]) -> list[ def _zarr_asset_metadata(artifact: ArtifactRecord) -> dict[str, object]: metadata: dict[str, object] = {"zarr:node_type": "group"} + if artifact.format == ArtifactFormat.ICECHUNK: + metadata["zarr:zarr_format"] = 3 + return metadata artifact_path = _artifact_store_path(artifact) consolidated = _zarr_consolidated_flag(artifact_path) if consolidated is not None: @@ -428,8 +473,31 @@ def _zarr_asset_metadata(artifact: ArtifactRecord) -> dict[str, object]: return metadata -def _zarr_open_kwargs(artifact: ArtifactRecord) -> dict[str, bool | None]: - return {"consolidated": _zarr_consolidated_flag(_artifact_store_path(artifact))} +def _zarr_open_kwargs(artifact: ArtifactRecord) -> dict[str, object]: + artifact_path = _artifact_store_path(artifact) + if artifact.format == ArtifactFormat.ICECHUNK: + # Icechunk stores served over HTTP must use consolidated=False so that + # xarray reads zarr.json metadata directly rather than attempting HTTP + # directory listings (which our endpoint doesn't support). 
+ # Pyramid stores also need group="0" — the root URL is exposed for + # zarr-layer zoom selection but data variables live under group "0". + kwargs: dict[str, object] = {"consolidated": False} + try: + import zarr + + from climate_api.ingest.store import open_or_create_repo + + icechunk_path = Path(artifact_path) + if icechunk_path.exists(): + repo = open_or_create_repo(icechunk_path) + session = repo.readonly_session("main") + root = zarr.open_group(session.store, mode="r") + if "multiscales" in root.attrs: + kwargs["group"] = "0" + except Exception: + pass + return kwargs + return {"consolidated": _zarr_consolidated_flag(artifact_path)} def _build_renders(artifact: ArtifactRecord, source_dataset: dict[str, Any]) -> dict[str, Any] | None: @@ -438,15 +506,21 @@ def _build_renders(artifact: ArtifactRecord, source_dataset: dict[str, Any]) -> return None colormap_name = display.get("colormap") value_range = display.get("range") - if not isinstance(colormap_name, str) or not isinstance(value_range, list) or len(value_range) != 2: - return None + palette = display.get("palette") + render: dict[str, Any] = { "title": artifact.dataset_name, "assets": ["zarr"], - "rescale": [[float(value_range[0]), float(value_range[1])]], - "colormap_name": colormap_name, "climate_api:variable": artifact.variable, } + + if isinstance(palette, dict) and palette: + render["climate_api:palette"] = {str(k): str(v) for k, v in palette.items()} + elif isinstance(colormap_name, str) and isinstance(value_range, list) and len(value_range) == 2: + render["colormap_name"] = colormap_name + render["rescale"] = [[float(value_range[0]), float(value_range[1])]] + else: + return None nodata = display.get("nodata") if nodata is not None: render["nodata"] = float(nodata) @@ -474,3 +548,38 @@ def _zarr_consolidated_flag(artifact_path: str) -> bool | None: if (store_root / ".zgroup").exists(): return False return None + + +def _detect_dataset_crs(ds: Any) -> str | None: + """Read the EPSG CRS code from a 
dataset, or None if undetectable. + + Checks (in order): spatial_ref WKT coordinate, then dimension units/standard_name. + Used to override the deployment-wide proj:code with the actual native CRS of the + data so that datasets stored in WGS84 (e.g. ERA5-Land, WorldPop) are not + misidentified as projected. + """ + if not hasattr(ds, "coords"): + return None + if "spatial_ref" in ds.coords: + try: + import pyproj + + attrs = dict(ds["spatial_ref"].attrs) + wkt = attrs.get("crs_wkt") or attrs.get("spatial_ref") + if wkt: + crs = pyproj.CRS.from_wkt(str(wkt)) + epsg = crs.to_epsg() + if epsg: + return f"EPSG:{epsg}" + except Exception: + pass + for dim in set(getattr(ds, "dims", {})): + if dim not in ds.coords: + continue + attrs = dict(ds[dim].attrs) + if attrs.get("units") in ("degrees_east", "degrees_north") or attrs.get("standard_name") in ( + "longitude", + "latitude", + ): + return "EPSG:4326" + return None diff --git a/climate_api/system/routes.py b/climate_api/system/routes.py index 3ac2d93f..fa7f8992 100644 --- a/climate_api/system/routes.py +++ b/climate_api/system/routes.py @@ -1,11 +1,14 @@ """Root API endpoints.""" +import asyncio +import json import sys import urllib.parse from importlib.metadata import version as _pkg_version +from typing import Any, AsyncGenerator from fastapi import APIRouter, Request -from fastapi.responses import HTMLResponse, JSONResponse, Response +from fastapi.responses import HTMLResponse, JSONResponse, Response, StreamingResponse from starlette.responses import RedirectResponse from .schemas import AppInfo, HealthStatus, Status @@ -14,6 +17,33 @@ router = APIRouter() +_SSE_HEADERS = { + "Cache-Control": "no-cache", + "X-Accel-Buffering": "no", + "Connection": "keep-alive", +} + + +async def _sse_stream( + task: asyncio.Task[None], + queue: asyncio.Queue[dict[str, Any] | None], +) -> AsyncGenerator[str, None]: + """Yield SSE events from queue until the task sentinel (None) arrives. 
+ + Sends a keepalive comment every 5 seconds so the connection is not + dropped and partial response buffers are flushed by the browser. + """ + while True: + try: + event = await asyncio.wait_for(queue.get(), timeout=5.0) + except asyncio.TimeoutError: + yield ": keepalive\n\n" + continue + if event is None: + break + yield f"data: {json.dumps(event)}\n\n" + + @router.get("/", response_class=Response, responses=ROOT_RESPONSES) def read_index(request: Request) -> Response: """Return the landing page (HTML) or a navigation object (JSON with ?f=json).""" @@ -42,73 +72,104 @@ def manage( @router.post("/manage/ingest", include_in_schema=False) -async def manage_ingest(request: Request) -> RedirectResponse: - """Handle ingest form submission and redirect to the management page.""" - from fastapi import HTTPException - +async def manage_ingest(request: Request) -> Response: + """Stream ingest progress as SSE, then signal redirect on completion.""" from climate_api.data_registry.services.datasets import get_dataset from climate_api.extents.services import get_extent_or_404 from climate_api.ingestions.services import create_artifact base = str(request.base_url).rstrip("/") - try: - form = await request.form() - dataset_id = str(form.get("dataset_id", "")) - start = str(form.get("start", "")) - end = str(form.get("end", "")) or None - publish = "publish" in form - overwrite = "overwrite" in form - - template = get_dataset(dataset_id) - if template is None: - msg = urllib.parse.quote(f"Dataset template '{dataset_id}' not found") - return RedirectResponse(f"{base}/manage?error={msg}", status_code=303) - - extent = get_extent_or_404() - resolved_bbox = list(extent["bbox"]) - country_code = extent.get("country_code") - - create_artifact( - dataset=template, - start=start, - end=end, - bbox=resolved_bbox, - country_code=country_code, - overwrite=overwrite, - prefer_zarr=True, - publish=publish, - ) - name = urllib.parse.quote(template.get("name", dataset_id)) - return 
RedirectResponse(f"{base}/manage?message=Ingested+{name}", status_code=303) - except HTTPException as exc: - msg = urllib.parse.quote(str(exc.detail)) - return RedirectResponse(f"{base}/manage?error={msg}", status_code=303) - except Exception as exc: - msg = urllib.parse.quote(str(exc)) + form = await request.form() + dataset_id = str(form.get("dataset_id", "")) + start = str(form.get("start", "")) + end = str(form.get("end", "")) or None + publish = "publish" in form + overwrite = "overwrite" in form + + template = get_dataset(dataset_id) + if template is None: + msg = urllib.parse.quote(f"Dataset template '{dataset_id}' not found") return RedirectResponse(f"{base}/manage?error={msg}", status_code=303) + extent = get_extent_or_404() + resolved_bbox = list(extent["bbox"]) + + queue: asyncio.Queue[dict[str, Any] | None] = asyncio.Queue() + loop = asyncio.get_event_loop() + error_holder: list[str] = [] + + def on_progress(done: int, total: int, message: str = "") -> None: + loop.call_soon_threadsafe(queue.put_nowait, {"done": done, "total": total, "message": message}) + + async def run() -> None: + try: + await asyncio.to_thread( + create_artifact, + dataset=template, + start=start, + end=end, + bbox=resolved_bbox, + overwrite=overwrite, + publish=publish, + on_progress=on_progress, + ) + except Exception as exc: + error_holder.append(str(exc)) + finally: + await queue.put(None) + + task = asyncio.create_task(run()) + + async def event_stream() -> AsyncGenerator[str, None]: + async for chunk in _sse_stream(task, queue): + yield chunk + if error_holder: + yield f"data: {json.dumps({'error': error_holder[0]})}\n\n" + else: + name = urllib.parse.quote(str(template.get("name", dataset_id))) + yield f"data: {json.dumps({'redirect': f'{base}/manage?message=Ingested+{name}'})}\n\n" + + return StreamingResponse(event_stream(), media_type="text/event-stream", headers=_SSE_HEADERS) -@router.post("/manage/sync", include_in_schema=False) -async def manage_sync(request: Request) 
-> RedirectResponse: - """Handle sync form submission and redirect to the management page.""" - from fastapi import HTTPException +@router.post("/manage/sync", include_in_schema=False) +async def manage_sync(request: Request) -> Response: + """Stream sync progress as SSE, then signal redirect on completion.""" from climate_api.ingestions.services import sync_dataset base = str(request.base_url).rstrip("/") - try: - form = await request.form() - dataset_id = str(form.get("dataset_id", "")) - publish = "publish" in form - - sync_dataset(dataset_id=dataset_id, end=None, prefer_zarr=True, publish=publish) - return RedirectResponse(f"{base}/manage?message=Sync+completed", status_code=303) - except HTTPException as exc: - msg = urllib.parse.quote(str(exc.detail)) - return RedirectResponse(f"{base}/manage?error={msg}", status_code=303) - except Exception as exc: - msg = urllib.parse.quote(str(exc)) - return RedirectResponse(f"{base}/manage?error={msg}", status_code=303) + form = await request.form() + dataset_id = str(form.get("dataset_id", "")) + publish = "publish" in form + + queue: asyncio.Queue[dict[str, Any] | None] = asyncio.Queue() + loop = asyncio.get_event_loop() + error_holder: list[str] = [] + + def on_progress(done: int, total: int, message: str = "") -> None: + loop.call_soon_threadsafe(queue.put_nowait, {"done": done, "total": total, "message": message}) + + async def run() -> None: + try: + await asyncio.to_thread( + sync_dataset, dataset_id=dataset_id, end=None, publish=publish, on_progress=on_progress + ) + except Exception as exc: + error_holder.append(str(exc)) + finally: + await queue.put(None) + + task = asyncio.create_task(run()) + + async def event_stream() -> AsyncGenerator[str, None]: + async for chunk in _sse_stream(task, queue): + yield chunk + if error_holder: + yield f"data: {json.dumps({'error': error_holder[0]})}\n\n" + else: + yield f"data: {json.dumps({'redirect': f'{base}/manage?message=Sync+completed'})}\n\n" + + return 
StreamingResponse(event_stream(), media_type="text/event-stream", headers=_SSE_HEADERS) @router.get("/health") diff --git a/climate_api/templates/manage.html b/climate_api/templates/manage.html index bcd82140..1704b87b 100644 --- a/climate_api/templates/manage.html +++ b/climate_api/templates/manage.html @@ -250,6 +250,41 @@ background: #f1f5f9; border-color: #94a3b8; } + .sync-btn:disabled { + opacity: 0.5; + cursor: not-allowed; + } + + .progress-area { + margin-top: 1rem; + display: none; + } + .progress-area.active { + display: block; + } + .progress-bar-track { + background: #e2e8f0; + border-radius: 99px; + height: 6px; + overflow: hidden; + margin-bottom: 0.4rem; + } + .progress-bar-fill { + background: #0066cc; + height: 100%; + width: 0%; + border-radius: 99px; + transition: width 0.3s ease; + } + .progress-text { + font-size: 0.78rem; + color: #64748b; + } + + button[type="submit"]:disabled { + opacity: 0.6; + cursor: not-allowed; + } .divider { border: none; @@ -332,15 +367,6 @@

Ingest dataset

/> -
- - -
-
@@ -355,8 +381,12 @@

Ingest dataset


- - Ingestion runs synchronously and may take several minutes. + + This may take several minutes. +
+
+
+
{% endif %} @@ -395,8 +425,12 @@

Ingested datasets {{ datasets | length }}

- +
+
+
+
+
{% endfor %} @@ -413,5 +447,96 @@

Ingested datasets {{ datasets | length }}

+ + diff --git a/climate_api/templates/map-viewer.html b/climate_api/templates/map-viewer.html index 3c164769..cb109c68 100644 --- a/climate_api/templates/map-viewer.html +++ b/climate_api/templates/map-viewer.html @@ -255,6 +255,15 @@

{{ name }}

} } + function buildPaletteLut(palette) { + const lut = new Array(256).fill("#000000"); + for (const [value, color] of Object.entries(palette)) { + const idx = parseInt(value, 10); + if (idx >= 0 && idx < 256) lut[idx] = color; + } + return lut; + } + // Resolve the temporal dimension key and step list from cube:dimensions. function getTimeDimKey(dimensions) { for (const [key, val] of Object.entries(dimensions ?? {})) { @@ -339,8 +348,8 @@

{{ name }}

(_, i) => cm[Math.round((i * (cm.length - 1)) / 31)] ); legendBar.style.background = `linear-gradient(to right, ${stops.join(", ")})`; - legendMin.textContent = clim[0]; - legendMax.textContent = clim[1]; + legendMin.textContent = clim ? clim[0] : ""; + legendMax.textContent = clim ? clim[1] : ""; legendUnits.textContent = units ? `(${units})` : ""; legendEl.classList.remove("hidden"); } @@ -456,7 +465,8 @@

{{ name }}

return; } - const clim = renders.rescale?.[0] ?? [0, 100]; + const palette = renders["climate_api:palette"] ?? null; + const clim = palette ? [0, 255] : (renders.rescale?.[0] ?? [0, 100]); const colormapName = renders.colormap_name ?? "viridis"; const fillValue = renders.nodata ?? null; const variable = @@ -492,7 +502,7 @@

{{ name }}

let cm; try { - cm = buildColormap(colormapName); + cm = palette ? buildPaletteLut(palette) : buildColormap(colormapName); const zarrVersion = zarr["zarr:zarr_format"] ?? null; const selector = timeStepCount() > 0 ? { [timeDimKey]: 0 } : {}; @@ -546,7 +556,7 @@

{{ name }}

metaUnits.textContent = units || "—"; datasetMeta.classList.remove("hidden"); - updateLegend(cm, clim, units); + updateLegend(cm, palette ? null : clim, units); setStatus(""); } diff --git a/climate_api/transforms/__init__.py b/climate_api/transforms/__init__.py index 6c911944..d855881e 100644 --- a/climate_api/transforms/__init__.py +++ b/climate_api/transforms/__init__.py @@ -8,6 +8,6 @@ """ from .reproject import reproject_to_instance_crs -from .unit_conversion import kelvin_to_celsius, metres_to_mm +from .unit_conversion import kelvin_to_celsius, kg_per_m3_to_ug_per_m3, metres_to_mm -__all__ = ["kelvin_to_celsius", "metres_to_mm", "reproject_to_instance_crs"] +__all__ = ["kelvin_to_celsius", "kg_per_m3_to_ug_per_m3", "metres_to_mm", "reproject_to_instance_crs"] diff --git a/climate_api/transforms/reproject.py b/climate_api/transforms/reproject.py index d3842e9b..775f1a17 100644 --- a/climate_api/transforms/reproject.py +++ b/climate_api/transforms/reproject.py @@ -24,7 +24,7 @@ def reproject_to_instance_crs( ``params`` dict if your source uses a different input CRS. The dataset must already have ``x`` and ``y`` as its spatial dimension names, - which is guaranteed by ``build_dataset_zarr`` before transforms are run. + which is guaranteed by the ingest orchestrator before transforms are run. 
""" target_crs = api_config.get_crs() if target_crs == source_crs: diff --git a/climate_api/transforms/unit_conversion.py b/climate_api/transforms/unit_conversion.py index ec0243b1..47650591 100644 --- a/climate_api/transforms/unit_conversion.py +++ b/climate_api/transforms/unit_conversion.py @@ -25,3 +25,9 @@ def metres_to_mm(ds: xr.Dataset, dataset: dict[str, Any]) -> xr.Dataset: """Convert the dataset variable from metres to millimetres.""" logger.info("Converting '%s' from m to mm", dataset["variable"]) return _apply(ds, dataset, scale=1000.0, offset=0.0, units="mm") + + +def kg_per_m3_to_ug_per_m3(ds: xr.Dataset, dataset: dict[str, Any]) -> xr.Dataset: + """Convert the dataset variable from kg m⁻³ to μg m⁻³ (×10⁹).""" + logger.info("Converting '%s' from kg/m³ to μg/m³", dataset["variable"]) + return _apply(ds, dataset, scale=1e9, offset=0.0, units="μg m⁻³") diff --git a/docs/adding_custom_datasets.md b/docs/adding_custom_datasets.md index 0342fade..e862fe41 100644 --- a/docs/adding_custom_datasets.md +++ b/docs/adding_custom_datasets.md @@ -8,59 +8,10 @@ The built-in dataset templates (CHIRPS3, ERA5-Land, WorldPop) ship as package da Adding a custom dataset involves two things: -1. **A download function** — a Python function that downloads data and writes it as one or more NetCDF files to a given directory. -2. **A dataset template YAML** — a file that describes the dataset and tells the API which download function to call. +1. **An `IngestionPlugin` class** — streams data directly into an Icechunk store one period at a time. +2. **A dataset template YAML** — a file that describes the dataset and tells the API which plugin to call. -## Step 1: Write the download function - -The download function must be importable as a dotted Python path. The API calls it with keyword arguments and ignores the return value — the function is expected to write NetCDF files to `dirname` using `prefix` as the filename prefix. 
- -```python -# mypackage/sources/enacts.py -from pathlib import Path - -def download( - *, - start: str, # ISO 8601 date or datetime - end: str, - dirname: Path, # directory to write output files into - prefix: str, # filename prefix (use e.g. f"{prefix}_{year}.nc") - overwrite: bool, - bbox: list[float], # [xmin, ymin, xmax, ymax] — include only if your source needs it - **kwargs: object, # absorbs default_params from the YAML template -) -> None: - """Download ENACTS rainfall and write NetCDF files to dirname.""" - ... -``` - -**Required parameters** — always passed by the API: - -| Parameter | Type | Description | -| ----------- | ---------- | ----------- | -| `start` | `str` | Start of the requested time range (ISO 8601) | -| `end` | `str` | End of the requested time range (ISO 8601) | -| `dirname` | `Path` | Directory to write output NetCDF files into | -| `prefix` | `str` | Filename prefix for output files | -| `overwrite` | `bool` | Whether to overwrite existing cached files | - -**Optional parameters** — passed only when present in the function signature: - -| Parameter | Type | Description | -| -------------- | --------------- | ----------- | -| `bbox` | `list[float]` | Bounding box as `[xmin, ymin, xmax, ymax]` — include this if your source requires a spatial filter | -| `country_code` | `str` | ISO 3166-1 alpha-3 code — include this if your source (e.g. WorldPop) requires a country code | - -Any extra keyword arguments from `ingestion.default_params` in the YAML template are forwarded as additional kwargs. - -The API normalises coordinate names at write time: `valid_time` → `time`, `lat`/`latitude` → `y`, `lon`/`longitude` → `x`. Using the canonical names in your output avoids any ambiguity, but upstream names are handled automatically. 
- -Install your package in the same environment as the Climate API: - -```bash -pip install ./mypackage -``` - -## Step 2: Create a dataset template YAML +## Step 1: Create a dataset template YAML Create a directory for your custom templates and add a YAML file. Each file contains a list of templates (even if there is only one): @@ -75,7 +26,9 @@ Create a directory for your custom templates and add a YAML file. Each file cont kind: temporal execution: append ingestion: - function: mypackage.sources.enacts.download + plugin: mypackage.sources.EnactsPlugin + params: + variable: rainfall units: mm resolution: 4 km x 4 km source: ENACTS @@ -127,8 +80,8 @@ Omit `sync.availability` entirely for `static` datasets or when you always want | Field | Required | Description | | ----- | -------- | ----------- | -| `ingestion.function` | Yes | Dotted path to the download function | -| `ingestion.default_params` | No | Extra keyword arguments forwarded to the download function | +| `ingestion.plugin` | Yes | Dotted path to an `IngestionPlugin` class | +| `ingestion.params` | No | Constructor keyword arguments forwarded to the plugin class | **Transforms** — applied after download, before writing to Zarr: @@ -223,5 +176,89 @@ The smallest valid template for a static dataset with no sync: sync: kind: static ingestion: - function: mypackage.sources.my_source.download + plugin: mypackage.sources.my_plugin.MyPlugin ``` + +--- + +## Ingestion plugin + +For sources that need streaming access or resumable long ingests, implement an `IngestionPlugin` instead of a download function. The plugin streams data directly into the Icechunk store one period at a time — no intermediate files, no full-rebuild on sync. 
+ +### Plugin skeleton + +```python +# mypackage/sources/my_plugin.py +from __future__ import annotations + +import asyncio +from typing import Any + +import numpy as np +import xarray as xr + +from climate_api.ingest.protocol import GridSpec, enumerate_periods + + +class MyPlugin: + max_concurrency = 2 # fetch this many periods in parallel + commit_batch_size = 1 # cursor checkpoint interval + + def __init__(self, variable: str) -> None: + self.variable = variable + + def probe(self, bbox: list[float], **_: Any) -> GridSpec: + """Return grid shape and CRS without downloading data.""" + # Derive shape from known resolution, or open a small metadata request. + xmin, ymin, xmax, ymax = bbox + res = 0.05 # degrees per pixel + import math + nx = max(1, math.ceil((xmax - xmin) / res)) + ny = max(1, math.ceil((ymax - ymin) / res)) + return GridSpec(shape=(ny, nx), crs=4326, dtype=np.dtype("float32"), nodata=-9999.0) + + def periods(self, start: str, end: str) -> list[str]: + """Return the ordered list of period IDs to fetch.""" + # enumerate_periods handles daily/hourly/monthly/yearly enumeration and + # optional availability cutoff clamping. + return enumerate_periods(start, end, "daily") + + def fetch_period(self, period_id: str, bbox: list[float], **_: Any) -> xr.Dataset: + """Fetch one period. Must return a Dataset with a 'time' dimension.""" + # Blocking I/O is fine — the orchestrator runs this in asyncio.to_thread. + ... 
+``` + +### Dataset template + +```yaml +- id: my_streaming_dataset + name: My streaming dataset + variable: rainfall + period_type: daily + sync: + kind: temporal + execution: append + extents: + spatial: + bbox: [-180, -50, 180, 50] + temporal: + begin: "2000-01-01" + resolution: P1D + ingestion: + plugin: mypackage.sources.my_plugin.MyPlugin + params: + variable: rainfall + units: mm + resolution: 5 km x 5 km + source: My source +``` + +### Key conventions for `fetch_period` + +- The returned Dataset must have a `time` dimension with exactly the period's time steps as coordinate values. +- Spatial dimensions should be named `x` and `y` (or match `GridSpec.x_dim` / `GridSpec.y_dim`). +- Clear all encoding before returning and pin the time encoding: `ds["time"].encoding.update({"units": "days since 1970-01-01", "dtype": "int32"})`. +- Blocking I/O (rioxarray, requests) is fine inside `fetch_period` — as noted in the skeleton above, the orchestrator runs it in `asyncio.to_thread`. + +See the built-in plugins (`climate_api/ingest/plugins/`) for complete worked examples: `chirps3.py` (COG range requests), `era5_land.py` (remote zarr), and `worldpop.py` (full-file download). diff --git a/docs/architecture.md b/docs/architecture.md index 0f2a8a71..92f9012b 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -16,7 +16,7 @@ A template defines: - the dataset identifier and display metadata - the variable name, units, and period type -- how to download the data (`ingestion.function`) +- how to ingest the data (`ingestion.plugin`) - what transforms to apply (`transforms`) - what sync strategy to use (`sync.kind`, `sync.execution`) @@ -52,43 +52,32 @@ This is a deliberate design constraint: each instance serves one place. 
A Sierra ## Data lifecycle +All datasets are ingested through the plugin path (`ingestion.plugin`): + +**Plugin path** (`ingestion.plugin`) — streams data directly into an Icechunk store: + ``` Template (YAML) │ │ POST /ingestions (or POST /sync) ▼ -Ingestion - │ call ingestion function → NetCDF files on disk - │ apply transforms - │ reproject to instance CRS - │ write GeoZarr store - │ compute coverage (spatial + temporal extent of actual data) - ▼ -Artifact (internal record) +Orchestrator + │ probe() → fix chunk shape, write GeoZarr attributes + │ periods() → compare against committed store state + │ for each pending period: + │ fetch_period() → xr.Dataset (in source CRS) + │ to_zarr(icechunk_store, append_dim="time") + │ commit every period; checkpoint cursor every commit_batch_size periods + │ rechunk in-place (if rechunk_time is set) + │ expire intermediate snapshots + │ register ArtifactFormat.ICECHUNK artifact record │ │ publish=true ▼ -Managed dataset (public API) - ├── /datasets/{id} — native metadata - ├── /zarr/{id} — raw zarr store access - ├── /stac/collections/{id} — STAC discovery - └── /ogcapi/collections/{id} — OGC API access +Managed dataset (public API) — /datasets/{id}, /zarr/{id}, /stac/collections/{id}, /ogcapi/collections/{id} ``` -The ingestion function is called identically by both `POST /ingestions` and `POST /sync` — the framework invokes it the same way regardless of the trigger. A correctly written ingestion function works for both without any changes. - -The framework is responsible for everything from "write zarr" onward. An ingestion function only needs to write NetCDF files to a given directory. The framework then: - -1. reads and normalises the coordinate names -2. applies transforms (unit conversion, etc.) -3. reprojects to the instance CRS -4. builds the zarr store with auto-computed chunking -5. writes GeoZarr root attributes (`spatial:bbox`, `proj:code`) so map clients can position tiles -6. computes artifact coverage (spatial bounds + time range) from the written data -7. 
stores the artifact record -8. publishes the managed dataset through pygeoapi if `publish=true` - -This division means that ingestion functions do not need to know about zarr conventions, STAC, OGC, or pygeoapi. They write data files; the framework handles everything else. +All ingest writes go directly to an Icechunk store — no intermediate files on disk. A crash leaves the store at the last committed period; restart resumes from there. The store is readable and serveable from the first committed period. --- @@ -132,43 +121,29 @@ Before executing a sync, the engine calls the availability function to clamp the The platform has four extension points. Each one has a narrow contract — the framework handles everything else automatically. -### Ingestion function +### Ingestion plugin ```python -def download( - *, - start: str, # ISO 8601 date or datetime - end: str, - dirname: Path, # write output files here - prefix: str, # use as filename prefix, e.g. f"{prefix}_{year}.nc" - overwrite: bool, - bbox: list[float], # optional — only if the source needs a spatial filter - **kwargs, # default_params from the YAML template -) -> None: - # Write one or more NetCDF files to dirname. -``` - -The function writes NetCDF files. The framework reads them, normalises coordinate names, applies transforms, reprojects to the instance CRS, builds the zarr, writes GeoZarr attributes, computes coverage, and registers the artifact. +class MyPlugin: + max_concurrency: int = 1 # parallel fetch limit + commit_batch_size: int = 1 # cursor checkpoint interval (every period is committed) -The ingestion function is called identically by `POST /ingestions` and `POST /sync`. The caller makes no difference to the function — it always receives the same parameters. + async def probe(self, bbox: list[float], **params) -> GridSpec: + """Metadata-only probe — no data transfer.""" + ... 
-**Reusing ingestion logic across templates**: multiple YAML templates can reference the same Python function and differentiate via `default_params`. This is the intended pattern for sources that have the same fetching logic but expose different variables: + async def periods(self, start: str, end: str) -> list[str]: + """Return the ordered list of period IDs available from start to end.""" + ... -```yaml -# era5land_temperature_hourly.yaml -ingestion: - function: dhis2eo.data.era5_land.download - default_params: - variable: 2m_temperature - -# era5land_precipitation_hourly.yaml -ingestion: - function: dhis2eo.data.era5_land.download - default_params: - variable: total_precipitation + async def fetch_period(self, period_id: str, bbox: list[float], **params) -> xr.Dataset: + """Fetch one period. Return a Dataset with a 'time' dimension in source CRS.""" + ... ``` -No framework changes are needed to support a new variable from the same source. +The orchestrator calls `probe()` once, `periods()` once, then drives a bounded-concurrency fetch loop — writing each period directly to an Icechunk store (one Icechunk commit per period) and checkpointing the job cursor every `commit_batch_size` periods. Plugins never touch zarr or Icechunk directly. + +See [Extensibility — Ingestion plugins](extensibility.md#ingestion-plugins) for the full protocol and `GridSpec` reference. ### Transform function @@ -179,7 +154,7 @@ def my_transform(ds: xr.Dataset, dataset: dict) -> xr.Dataset: # Do not modify dataset-level ds.attrs — the framework manages those. ``` -Transforms are applied in order after the ingestion function returns, before the zarr is written. They receive the full xarray Dataset and the template dict. They return a modified Dataset. They do not write to disk. +Transforms are applied in order after each period is fetched, before the data is written to the Icechunk store. They receive the full xarray Dataset and the template dict. They return a modified Dataset. 
They do not write to disk. ### Process execution function @@ -197,15 +172,14 @@ Processes are named operations triggered via `POST /processes/{id}/execution`. T Transforms are applied at a consistent point in the ingestion lifecycle: -1. ingestion function writes raw NetCDF files to disk -2. framework reads and normalises the data into an xarray Dataset -3. `_run_transforms(ds, dataset)` applies each declared transform in order -4. result is reprojected to instance CRS -5. zarr store is written with auto-computed chunking -6. framework writes GeoZarr root attributes -7. framework computes coverage from the zarr +1. plugin fetches one period of raw data as an xarray Dataset in the source CRS +2. `_run_transforms(ds, dataset)` applies each declared transform in order +3. orchestrator writes the period to the Icechunk store +4. framework writes GeoZarr root attributes after the first period is committed -Transforms see post-download, pre-reproject data. They should only modify data values and variable-level attributes. The framework writes dataset-level attributes (GeoZarr) after the transform pipeline completes. +Transforms see raw fetched values in the source CRS and source units. They should only modify data values and variable-level attributes. The framework writes dataset-level attributes (GeoZarr) after the first write completes. + +No automatic reprojection occurs. Data is stored in whatever CRS the plugin returns (declared via `GridSpec.crs` in `probe()`). If CRS conversion is needed, declare `reproject_to_instance_crs` as an explicit transform in the `transforms` list. --- @@ -219,13 +193,24 @@ Every zarr artifact must have GeoZarr root attributes for map rendering to work The map viewer reads `spatial:bbox` and `proj:code` to determine where to position tiles on the map. 
-**The framework writes these attributes — plugins do not.** They are written in `build_dataset_zarr` after transforms and reprojection, using the actual coordinate bounds of the final written data and the instance CRS. +**The framework writes these attributes — plugins do not.** They are written by the orchestrator after the first period is committed, using the actual coordinate bounds of the written data and the CRS declared in `GridSpec.crs`. --- ## CRS handling -The instance CRS is configured in `climate-api.yaml`: +Datasets are stored in whatever CRS the plugin returns. The plugin declares this via `GridSpec.crs` in its `probe()` response, and the framework writes `proj:code` accordingly. No automatic reprojection occurs. + +If you need to reproject data to a specific CRS, declare `reproject_to_instance_crs` as an explicit transform in the dataset template: + +```yaml +transforms: + - function: climate_api.transforms.reproject_to_instance_crs + params: + source_crs: EPSG:32633 +``` + +The instance CRS (used as the reprojection target when `reproject_to_instance_crs` is declared) is configured in `climate-api.yaml`: ```yaml extent: @@ -234,10 +219,6 @@ extent: crs: EPSG:32633 # optional; defaults to EPSG:4326 ``` -Downloaded data is reprojected from the source CRS (`source_crs` in the template, default `EPSG:4326`) to the instance CRS during ingestion. The stored zarr is always in the instance CRS. - -If no `crs` is set in the config, data is stored in `EPSG:4326` (WGS84). This is the correct default for instances that do not need a metric CRS. - --- ## Artifact deduplication and version history @@ -260,11 +241,10 @@ Plugin code (ingestion functions, transforms, processes) can rely on the followi | Concern | Where handled | | ----------------------------------------------------- | ------------------------------------------- | -| Coordinate name normalisation (`lat` → `y`, etc.) 
| `build_dataset_zarr` | -| Reprojection to instance CRS | `reproject_to_instance_crs` | -| Zarr chunking (auto-sized from `extents.temporal.resolution`) | `_compute_time_space_chunks` | -| Multiscale pyramid generation (when dims > 2048×2048) | `build_dataset_zarr` | -| GeoZarr root attributes (`spatial:bbox`, `proj:code`) | `build_dataset_zarr` | +| Coordinate name normalisation (`lat` → `y`, etc.) | Plugin (returns canonical `x`/`y`/`time`) | +| Zarr chunk sizing (time: 1 per period → rechunk pass) | `rechunk_store` (if `rechunk_time` set) | +| Multiscale pyramid generation (when dims > 2048×2048) | `build_pyramid_store` (if `pyramid=True`) | +| GeoZarr root attributes (`spatial:bbox`, `proj:code`) | Orchestrator after first period commit | | Artifact coverage computation | `_coverage_from_dataset` | | Artifact record persistence | `_store_artifact` | | pygeoapi publication | `publish_artifact_record` if `publish=true` | @@ -284,10 +264,12 @@ Each instance is configured for one place. This keeps the data model simple (no The sync engine validates that new data connects to the end of the existing artifact before appending. If a gap exists, the sync fails rather than silently producing a dataset with a hole. This is a deliberate constraint: downstream consumers (DHIS2, CHAP) depend on continuous time series and should not receive data with silent gaps. -### The append execution mode avoids re-downloading history +### The append execution mode + +For **legacy ZARR datasets** (downloader-based, no `ingestion.plugin`), `append` downloads only the missing time range and rebuilds the full zarr from all cached files. The local cache (NetCDF files in `data/downloads/`) is the source of truth; the zarr is a derived view. If the cache is deleted, a rematerialize is required to recover. -`append` downloads only the missing range and rebuilds the full zarr from all cached files. 
This means the local cache (NetCDF files in `data/downloads/`) is the source of truth for the full time series; the zarr is a derived view. If the cache is deleted, a rematerialize is required to recover. +For **plugin-path** datasets, `append` compares the pending period list against the already-committed time coordinates in the Icechunk store and fetches only the missing periods. The Icechunk store itself is the source of truth — no separate download cache. A crash leaves the store at the last committed period; restart resumes from there without any additional recovery logic. -### Transforms run after download, before reproject +### Transforms run per period, before the zarr write -Transforms see raw downloaded values in the source CRS and source units. The order is: download → transform → reproject → write zarr. +Transforms see raw fetched values in the plugin's source CRS and units. The order per period is: fetch → transform → write zarr. diff --git a/docs/built_in_datasets.md b/docs/built_in_datasets.md index bcf0e53f..4f68bc5c 100644 --- a/docs/built_in_datasets.md +++ b/docs/built_in_datasets.md @@ -21,7 +21,9 @@ To ingest a built-in dataset for your configured extent, see the [API reference] CHIRPS (Climate Hazards Group InfraRed Precipitation with Station data) v3 is a quasi-global daily precipitation dataset merging satellite thermal infrared imagery with station observations. It is widely used for drought monitoring, food security analysis, and WASH planning in low- and middle-income countries. -**Sync behaviour** — new data is ingested incrementally as it becomes available. CHIRPS has a nominal publication lag of around 3–7 days, so data through yesterday is not always present. The API uses a custom availability function that checks the actual latest available date from the CHIRPS server before each sync. +**Ingest method** — each day is fetched as a Cloud-Optimized GeoTIFF via HTTP range request. 
Only the configured bbox window is downloaded; full global files are never transferred. Up to four days are fetched concurrently and written directly to the Icechunk store — no intermediate files on disk. + +**Sync behaviour** — new days are appended incrementally. CHIRPS final data lags approximately 1–2 months (exact cutoff: end of the previous month if today is after the 20th, else two months back). Only the missing days are fetched on each sync run. **Transforms** — none applied; data is stored as received in mm. @@ -42,7 +44,9 @@ CHIRPS (Climate Hazards Group InfraRed Precipitation with Station data) v3 is a ERA5-Land is a global atmospheric reanalysis produced by ECMWF. The 2 m temperature variable (`t2m`) represents the air temperature 2 metres above the land surface, including corrections for topography relative to the ERA5 pressure levels. -**Sync behaviour** — new hours are appended incrementally. ERA5-Land is published with a nominal 5-day lag; the API will not request data closer than 120 hours to the current time. +**Ingest method** — the DestinE zarr store is opened lazily over HTTPS. Individual hourly periods are fetched and written directly to the Icechunk store — no intermediate files on disk. The source's 0–360° longitude range is converted to −180–180° before writing. `commit_batch_size = 720` checkpoints the cursor once per month of hourly data. + +**Sync behaviour** — new months are appended incrementally. ERA5-Land is published with a nominal 5-day lag; months closer than 120 hours to today are not requested. **Transforms** — raw values are in Kelvin. The `kelvin_to_celsius` transform is applied at ingest time, so stored values are in °C. @@ -63,7 +67,9 @@ ERA5-Land is a global atmospheric reanalysis produced by ECMWF. The 2 m temperat Total precipitation (`tp`) from ERA5-Land is an accumulated hourly value representing the sum of large-scale and convective precipitation falling onto the land surface. 
It is useful as a high-resolution complement to CHIRPS for countries outside CHIRPS's 50°N–50°S band, or for sub-daily analysis. -**Sync behaviour** — same 5-day lag as ERA5-Land temperature; hours are appended incrementally. +**Ingest method** — same as ERA5-Land temperature: individual hourly periods fetched from DestinE and written directly to Icechunk. + +**Sync behaviour** — same 5-day lag as ERA5-Land temperature; months are appended incrementally. **Transforms** — raw values are in metres per hour. The `metres_to_mm` transform converts to mm at ingest time. @@ -85,7 +91,9 @@ Total precipitation (`tp`) from ERA5-Land is an accumulated hourly value represe WorldPop Global2 provides gridded population estimates and projections at 100 m resolution. Each raster year represents estimated residential population counts. Years up to and including the present are backward-modelled estimates; years beyond the present are forward projections. -**Sync behaviour** — population data is released year by year, not as a continuous stream. The API uses a `release`-kind sync that checks each calendar year separately. Future years (projections) are also requestable, since the underlying data covers through 2030. +**Ingest method** — each year is downloaded as a per-country GeoTIFF from WorldPop's HTTP server (typically 50–200 MB per file), clipped to the configured bbox, and written directly to the Icechunk store. A multiscale pyramid is built after the initial ingest. The country code is taken from `extent.country_code` in `climate-api.yaml` (preferred) or from `ingestion.params.country_code` in the dataset template. + +**Sync behaviour** — population data is released year by year. The API uses a `release`-kind sync that checks each calendar year separately. Future years (projections through 2030) are also requestable. **Transforms** — none applied; values are stored as received (population counts per pixel). 
diff --git a/docs/extensibility.md b/docs/extensibility.md index f80c3e2d..bae7169e 100644 --- a/docs/extensibility.md +++ b/docs/extensibility.md @@ -7,7 +7,7 @@ The same pattern applies at every extension point: | Extension point | How to extend | Plugin location | | --------------- | ------------- | --------------- | | [Dataset templates](#dataset-templates) | YAML files | `plugins_dir/datasets/` | -| [Ingestion functions](#ingestion-functions) | Python function, dotted path in YAML | any importable path | +| [Ingestion plugins](#ingestion-plugins) | Python class implementing `IngestionPlugin` | any importable path | | [Transform functions](#transform-functions) | Python function, dotted path in YAML | any importable path | | [Processes](#processes) | YAML file + Python function | `plugins_dir/processes/` | @@ -34,16 +34,69 @@ See [Adding custom datasets](adding_custom_datasets.md) for the full template fi --- -## Ingestion functions +## Ingestion plugins -The `ingestion.function` field in a dataset template is a dotted Python path to the download function that fetches data for that dataset. +The `ingestion.plugin` field in a dataset template is a dotted Python path to an `IngestionPlugin` class. The plugin streams data directly into the Icechunk store one period at a time — no intermediate files, resumable on restart. ```yaml ingestion: - function: mypackage.sources.enacts.download + plugin: mypackage.sources.MyPlugin + params: + variable: rainfall + stage: final ``` -The function must follow the download function contract (see [Adding custom datasets](adding_custom_datasets.md#step-1-write-the-download-function)). It can live anywhere that is importable — either an installed package or a module placed directly under `plugins_dir` (which is automatically added to `sys.path`). +### Plugin protocol + +A plugin implements three focused methods. 
The Climate API layer owns the orchestration loop — plugins never write to zarr or Icechunk directly: + +```python +from climate_api.ingest.protocol import GridSpec +import xarray as xr + +class MyPlugin: + max_concurrency: int = 1 # parallel fetch limit + commit_batch_size: int = 1 # cursor checkpoint interval (every period is committed) + + def probe(self, bbox: list[float], **params) -> GridSpec: + """Metadata-only source probe. Returns grid shape, CRS, dtype. No data transfer.""" + ... + + def periods(self, start: str, end: str) -> list[str]: + """Return the ordered list of available period IDs from start to end.""" + ... + + def fetch_period(self, period_id: str, bbox: list[float], **params) -> xr.Dataset: + """Fetch one period. Return a dataset with a 'time' dimension in source CRS.""" + ... +``` + +**`GridSpec`** is the return type of `probe()`: + +```python +@dataclass +class GridSpec: + shape: tuple[int, int] # (ny, nx) grid dimensions + crs: int # EPSG code, e.g. 4326 or 32633 + dtype: np.dtype # data type, e.g. np.dtype("float32") + nodata: float | None = None # fill value + time_dim: bool = True # False for static (time-invariant) datasets + extra_dims: dict[str, int] = field(default_factory=dict) # e.g. {"age_group": 20} +``` + +Set `time_dim=False` for static (time-invariant) datasets — the orchestrator issues a single write with no append dimension. + +### What the orchestrator does + +1. Calls `probe()` once to fix the Icechunk store's chunk shape and write GeoZarr attributes. +2. Calls `periods()` once to get the full period list; filters against already-committed time coordinates. +3. Creates all fetch tasks upfront so up to `max_concurrency` fetches are in flight simultaneously. +4. Awaits tasks in chronological order so writes are always sequential. +5. Commits every period to the Icechunk store; checkpoints the job cursor every `commit_batch_size` periods so a restart resumes from the last checkpoint rather than the beginning. +6. 
On restart, resumes from the last committed period — a crash loses at most one uncommitted batch. +7. After all periods are written, runs a rechunk pass if the plugin declares `rechunk_time`, then expires intermediate Icechunk snapshots to prune history. + +See [Adding custom datasets](adding_custom_datasets.md#ingestion-plugin) for a worked example. --- diff --git a/docs/implementation-status.md b/docs/implementation-status.md index cec1425a..dd061ca8 100644 --- a/docs/implementation-status.md +++ b/docs/implementation-status.md @@ -153,7 +153,7 @@ Published Zarr-backed managed datasets appear there as one STAC Collection per d Current STAC details: -- pyramid Zarr stores (detected by the presence of a `0/` level on disk) expose `/zarr/{dataset_id}/0` as the canonical asset href +- Icechunk pyramid stores (detected by `multiscales` in root attrs) expose the root `/zarr/{dataset_id}` as the asset href so `@carbonplan/zarr-layer` can read the `multiscales` metadata and select the appropriate zoom level automatically; regular flat-Zarr pyramid stores (legacy) expose `/zarr/{dataset_id}/0` - temporal extents are normalized to RFC 3339 in both STAC and Datacube temporal extent fields - STAC collection `license` currently defaults to `various` - spatial `step` values are rounded for readability while preserving axis direction diff --git a/docs/instance_guide.md b/docs/instance_guide.md index 765696d8..3d99ca34 100644 --- a/docs/instance_guide.md +++ b/docs/instance_guide.md @@ -29,7 +29,7 @@ my-climate-service/ ├── .gitignore ├── plugins/ │ ├── datasets/ # custom dataset template YAMLs -│ ├── / # custom download / ingestion functions +│ ├── / # custom ingestion plugin modules │ │ ├── __init__.py │ │ └── daily.py │ ├── transforms/ # custom transform functions @@ -160,7 +160,7 @@ Visit `http://localhost:8000` to confirm the API is running. 
The `/extent` endpo ## Adding plugins -Plugins extend the instance with custom datasets, download functions, transforms, and processes. They live in `plugins_dir` and are loaded automatically at startup. The `plugins_dir` is added to `sys.path`, so Python modules placed directly inside it are importable. +Plugins extend the instance with custom datasets, ingestion plugins, transforms, and processes. They live in `plugins_dir` and are loaded automatically at startup. The `plugins_dir` is added to `sys.path`, so Python modules placed directly inside it are importable. ``` plugins/ @@ -168,7 +168,7 @@ plugins/ │ └── enacts_rainfall.yaml # custom dataset template ├── enacts/ │ ├── __init__.py -│ └── daily.py # download function referenced in the YAML +│ └── plugin.py # IngestionPlugin class referenced in the YAML ├── transforms/ │ ├── __init__.py │ └── enacts.py # transform function @@ -177,7 +177,7 @@ plugins/ └── spatial_stats.py ``` -See [Extensibility](extensibility.md) for the full specification of each extension point, and [Adding custom datasets](adding_custom_datasets.md) for the dataset template field reference and download function contract. +See [Extensibility](extensibility.md) for the full specification of each extension point, and [Adding custom datasets](adding_custom_datasets.md) for the dataset template field reference and plugin contract. --- diff --git a/docs/transforms.md b/docs/transforms.md index 83f3e47f..e7fd43f9 100644 --- a/docs/transforms.md +++ b/docs/transforms.md @@ -50,12 +50,15 @@ Used by: ERA5-Land total precipitation (`era5land_precipitation_hourly`). ## Reprojection -Reprojection to the instance CRS is handled automatically by the ingestion pipeline as a separate step after all user-declared transforms have run. It is not a transform and should not be declared in the `transforms` list. +No automatic reprojection occurs. Data is stored in whatever CRS the plugin returns (declared via `GridSpec.crs` in `probe()`). 
-If your source data is not in `EPSG:4326`, declare `source_crs` in the dataset template so the pipeline knows what to reproject from: +If you need to reproject to a different CRS, declare `reproject_to_instance_crs` as an explicit transform. It reprojects from `source_crs` (default `EPSG:4326`) to the instance CRS set in `climate-api.yaml`: ```yaml -source_crs: EPSG:32633 +transforms: + - function: climate_api.transforms.reproject_to_instance_crs + params: + source_crs: EPSG:32633 ``` --- diff --git a/docs/zarr_and_geozarr.md b/docs/zarr_and_geozarr.md index 48ae3f52..89132dea 100644 --- a/docs/zarr_and_geozarr.md +++ b/docs/zarr_and_geozarr.md @@ -38,7 +38,6 @@ The two halves of the term map directly onto the choices described in this docum **Analysis-ready** means a consumer can open the data and start computing without preprocessing: - Dimension names are normalised to `(time, x, y)` regardless of the source convention. -- All datasets in an instance share a single coordinate reference system. - Units are standardised by the transform pipeline (e.g. Kelvin → Celsius). **Cloud-optimized** means the data can be accessed efficiently over HTTP without downloading the whole file. The Zarr and GeoZarr formats provide all the necessary properties — chunk-level access, HTTP-native serving, multiscale pyramids, and cloud compatibility. @@ -47,14 +46,49 @@ The Climate API targets the same access pattern at country scale for arbitrary s --- +## Icechunk — versioned Zarr storage + +[Icechunk](https://icechunk.io) is a transactional storage layer that sits between the application and the underlying Zarr v3 data. It exposes a standard Zarr store interface to writers and readers, but adds **MVCC (multi-version concurrency control)**: every write is committed as an immutable snapshot, and readers always see a consistent view of the data regardless of concurrent writes. 
+ +### Why Icechunk + +Plain Zarr on disk is a directory of independent chunk files — there is no transaction boundary. If an ingest is interrupted mid-write, some chunks for a new time step may be written and others not, leaving the store in an inconsistent state with no way to roll back. + +The Climate API ingests one period at a time, committing each as a separate Icechunk snapshot. This gives three concrete properties: + +- **Safe resume** — if a job is cancelled or the server restarts, the next run reads the list of committed snapshots and skips periods that are already present. No partial writes are ever visible to readers. +- **Snapshot isolation** — a read session opened at the start of a request sees a consistent snapshot even if a concurrent ingest is writing new periods. Readers are never blocked by writers. +- **Prunable history** — intermediate per-period snapshots accumulate during ingest. After the rechunk and pyramid passes complete, `expire_snapshots()` prunes all but the latest, keeping disk usage proportional to data size rather than ingest history. + +### Snapshot lifecycle + +A typical WorldPop ingest produces snapshots roughly like this: + +``` +snapshot 1: write period 2015 +snapshot 2: write period 2016 +... +snapshot 16: write period 2030 +snapshot 17: rechunk: time=1 +snapshot 18: pyramid: 6 levels +→ expire_snapshots() prunes snapshots 1–17 +snapshot 18: (the only surviving snapshot — full pyramid, correctly chunked) +``` + +### Serving from Icechunk + +Zarr keys are read directly from the Icechunk session store rather than from files on disk. The HTTP surface is identical — the same `/zarr/{dataset_id}/` routes — but the backend resolves each key through the Icechunk MVCC layer instead of a `FileResponse`. + +--- + ## Store layout on disk -Each managed dataset has exactly one Zarr store on disk, stored under `{data_dir}/downloads/{dataset_id}.zarr`. 
The store is either: +Each managed dataset has exactly one Icechunk repository on disk, stored under `{data_dir}/downloads/{dataset_id}.icechunk`. The zarr content inside the repository is either: -- **Flat** — a single-resolution Zarr store with dimensions `(time, x, y)` -- **Pyramid** — a multi-resolution Zarr store with levels `0/`, `1/`, `2/`, … where `0/` is the full resolution +- **Flat** — a single-resolution store with dimensions `(time, x, y)` +- **Pyramid** — a multi-resolution store with levels `0/`, `1/`, `2/`, … where `0/` is the full resolution -The flat vs. pyramid decision is made at build time based on spatial size (see [Multiscale pyramids](#multiscale-pyramids) below). +The flat vs. pyramid decision is made at ingest time based on spatial size (see [Multiscale pyramids](#multiscale-pyramids) below). --- @@ -104,7 +138,7 @@ A plain Zarr store has no concept of spatial coordinates. A map viewer opening i | `proj:code` | `EPSG:4326` | CRS of the stored coordinates | | `zarr_conventions` | `[{...}]` | Convention declarations | -These attributes are computed from the actual coordinate bounds of the written data and the instance CRS. They are always written by the framework after transforms and reprojection have run. This guarantees they always reflect the final stored data. +These attributes are computed from the actual coordinate bounds of the written data and the CRS declared by the plugin in `GridSpec.crs`. They are written by the framework after the first period is committed. This guarantees they always reflect the final stored data. `zarr_conventions` for a flat store contains the base GeoZarr convention declaration. For pyramid stores it also includes a multiscales entry that declares the level structure. @@ -120,7 +154,7 @@ extent: crs: EPSG:32633 # optional; defaults to EPSG:4326 ``` -Datasets are always stored in the instance CRS. 
During ingestion, data is reprojected from its source CRS (declared as `source_crs` in the template, default `EPSG:4326`) to the instance CRS. The stored `spatial:bbox` is therefore in the instance CRS — UTM eastings and northings for a UTM instance, degrees for a WGS84 instance. +Datasets are stored in whatever CRS the plugin returns. The plugin declares this via `GridSpec.crs` in its `probe()` response, and the framework writes `proj:code` from that value. No automatic reprojection occurs — if CRS conversion is needed, declare `reproject_to_instance_crs` as an explicit transform in the dataset template. The stored `spatial:bbox` is in the plugin's native CRS. STAC metadata also stores the WGS84 bounding box alongside the native bbox, so catalogue clients that expect geographic coordinates always get one regardless of the instance CRS. @@ -128,7 +162,7 @@ STAC metadata also stores the WGS84 bounding box alongside the native bbox, so c ## How Zarr stores are served -The `/zarr/{dataset_id}/` endpoint serves individual files from the Zarr store directory using FastAPI's `FileResponse`. The ZarrLayer client issues one HTTP request per chunk file it needs. +The `/zarr/{dataset_id}/` endpoint serves Zarr keys from the Icechunk repository. The ZarrLayer client issues one HTTP request per key it needs: ``` GET /zarr/{dataset_id}/zarr.json → root metadata (JSON) @@ -136,9 +170,9 @@ GET /zarr/{dataset_id}/precip/c/0/0/0 → chunk at time=0, x=0, y=0 GET /zarr/{dataset_id}/time/c/0 → time coordinate chunk ``` -Metadata files (`zarr.json`) are returned as `application/json`. All other files — chunk data — are returned as `application/octet-stream`. Directory paths return a JSON listing of their contents. +Each request opens a readonly Icechunk session pinned to the latest committed snapshot, resolves the zarr key through the MVCC layer, and returns the raw bytes. 
Metadata files (`zarr.json`) are returned as `application/json`; chunk data as `application/octet-stream`; directory paths as a JSON listing. -This design means the zarr store is served by ordinary file serving — there is no zarr-specific server middleware. +The HTTP surface is identical to serving files from disk — ZarrLayer and other zarr clients require no changes — but correctness and consistency are guaranteed by Icechunk's snapshot model rather than filesystem state. --- diff --git a/examples/stac_discover_and_open.py b/examples/stac_discover_and_open.py index 3e845274..88fffaff 100644 --- a/examples/stac_discover_and_open.py +++ b/examples/stac_discover_and_open.py @@ -28,16 +28,17 @@ def main() -> None: ds = api.open(first["id"]) print(ds) - print(f"\nTime range: {ds.time.values[0]} → {ds.time.values[-1]}") - print(f"Time steps: {ds.sizes['time']}") - print(f"Latitude: {ds.latitude.min().item()} → {ds.latitude.max().item()}") - print(f"Longitude: {ds.longitude.min().item()} → {ds.longitude.max().item()}") + if "time" in ds.dims: + print(f"\nTime range: {ds.time.values[0]} → {ds.time.values[-1]}") + print(f"Time steps: {ds.sizes['time']}") + print(f"y: {ds.y.min().item():.4f} → {ds.y.max().item():.4f}") + print(f"x: {ds.x.min().item():.4f} → {ds.x.max().item():.4f}") variable = list(ds.data_vars)[0] - centre_lat = ds.latitude.mean().item() - centre_lon = ds.longitude.mean().item() - sample = ds[variable].isel(time=0).sel(latitude=centre_lat, longitude=centre_lon, method="nearest") - print(f"\n{variable} at domain centre, t=0: {sample.item()}") + centre = {"y": ds.y.mean().item(), "x": ds.x.mean().item()} + selector = {"time": 0} if "time" in ds.dims else {} + sample = ds[variable].isel(**selector).sel(**centre, method="nearest").compute() + print(f"\n{variable} at domain centre, {'t=0' if selector else 'static'}: {sample.item()}") if __name__ == "__main__": diff --git a/examples/zarr_direct_access.py b/examples/zarr_direct_access.py index 5cc722df..d2129d71 
100644 --- a/examples/zarr_direct_access.py +++ b/examples/zarr_direct_access.py @@ -24,29 +24,34 @@ def main() -> None: print(ds) print(f"\nDimensions: {dict(ds.sizes)}") - print(f"Time range: {ds.time.values[0]} → {ds.time.values[-1]}") - print(f"Latitude: {ds.latitude.min().item()} → {ds.latitude.max().item()}") - print(f"Longitude: {ds.longitude.min().item()} → {ds.longitude.max().item()}") + if "time" in ds.dims: + print(f"Time range: {ds.time.values[0]} → {ds.time.values[-1]}") + print(f"y: {ds.y.min().item():.4f} → {ds.y.max().item():.4f}") + print(f"x: {ds.x.min().item():.4f} → {ds.x.max().item():.4f}") variable = list(ds.data_vars)[0] # Select a single time step - t0 = ds.time.values[0] - snapshot = ds[variable].sel(time=t0) - print(f"\n{variable} snapshot at {t0}:") - print(f" shape: {snapshot.shape}, min: {snapshot.min().item()}, max: {snapshot.max().item()}") + if "time" in ds.dims: + t0 = ds.time.values[0] + snap = ds[variable].sel(time=t0).compute() + print(f"\n{variable} snapshot at {t0}:") + print(f" shape: {snap.shape}, min: {snap.min().item():.4f}, max: {snap.max().item():.4f}") # Select the point closest to the spatial centre of the domain - centre_lat = ds.latitude.mean().item() - centre_lon = ds.longitude.mean().item() - point = ds[variable].sel(latitude=centre_lat, longitude=centre_lon, method="nearest") - print(f"\n{variable} at domain centre ({centre_lat:.2f}, {centre_lon:.2f}):") - print(point.to_dataframe()[[variable]].head(10)) + cy, cx = ds.y.mean().item(), ds.x.mean().item() + point = ds[variable].sel(y=cy, x=cx, method="nearest") + print(f"\n{variable} at domain centre ({cy:.2f}, {cx:.2f}):") + if "time" in ds.dims: + print(point.to_dataframe()[[variable]].head(10)) + else: + print(f" value: {point.compute().item()}") # Spatial mean over the full domain — first 10 time steps - spatial_mean = ds[variable].isel(time=slice(10)).mean(dim=["latitude", "longitude"]) - print(f"\nSpatial mean {variable} time series (first 10 steps):") - 
print(spatial_mean.to_dataframe()[[variable]]) + if "time" in ds.dims: + spatial_mean = ds[variable].isel(time=slice(10)).mean(dim=["y", "x"]) + print(f"\nSpatial mean {variable} time series (first 10 steps):") + print(spatial_mean.to_dataframe()[[variable]]) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 8c96f96f..1e1cd40c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,8 @@ dependencies = [ "topozarr==0.0.*", "rioxarray>=0.17", "portalocker>=3.2.0", - "dhis2eo>=1.2.1", + "icechunk>=2.0,<3", + "python-multipart>=0.0.29", ] [project.urls] diff --git a/tests/test_client.py b/tests/test_client.py index 16e1a0b4..44509d36 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -1,13 +1,11 @@ from pathlib import Path from unittest.mock import MagicMock, patch -import httpx import numpy as np import pandas as pd -import pytest import xarray as xr -from climate_api.client import Client, _id_from_href, list_datasets, open_dataset +from climate_api.client import Client, _id_from_href def _make_catalog(hrefs: list[str]) -> dict: @@ -51,93 +49,6 @@ def test_id_from_href_strips_trailing_slash() -> None: assert _id_from_href("http://localhost/stac/collections/ds/") == "ds" -# ── module-level list_datasets ───────────────────────────────────────────────── - - -def test_list_datasets_returns_child_links() -> None: - catalog = _make_catalog(["http://localhost/stac/collections/chirps3_precipitation_daily_rwa"]) - with patch("climate_api.client.httpx.get", return_value=_make_response(catalog)) as mock_get: - result = list_datasets("http://localhost") - - mock_get.assert_called_once_with("http://localhost/stac/catalog.json", timeout=30) - assert len(result) == 1 - assert result[0]["rel"] == "child" - assert "chirps3" in result[0]["href"] - - -def test_list_datasets_returns_empty_for_no_children() -> None: - catalog = {"links": [{"rel": "root", "href": "http://localhost/stac/catalog.json"}]} - with 
patch("climate_api.client.httpx.get", return_value=_make_response(catalog)): - result = list_datasets("http://localhost") - - assert result == [] - - -def test_list_datasets_raises_on_http_error() -> None: - with patch("climate_api.client.httpx.get") as mock_get: - mock_get.return_value.raise_for_status.side_effect = httpx.HTTPStatusError( - "404", request=MagicMock(), response=MagicMock() - ) - with pytest.raises(httpx.HTTPStatusError): - list_datasets("http://localhost") - - -# ── module-level open_dataset ────────────────────────────────────────────────── - - -def test_open_dataset_fetches_collection_and_opens_zarr(tmp_path: Path) -> None: - zarr_path = tmp_path / "test.zarr" - ds = xr.Dataset( - {"precip": (["time", "latitude", "longitude"], np.ones((2, 3, 3), dtype="float32"))}, - coords={ - "time": pd.date_range("2024-01-01", periods=2, freq="D"), - "latitude": [3.0, 2.0, 1.0], - "longitude": [10.0, 11.0, 12.0], - }, - ) - ds.to_zarr(str(zarr_path), mode="w", consolidated=True) - - collection = _make_collection(str(zarr_path)) - with patch("climate_api.client.httpx.get", return_value=_make_response(collection)): - result = open_dataset("chirps3_precipitation_daily_rwa", base_url="http://localhost") - - try: - assert "precip" in result.data_vars - assert result.sizes["time"] == 2 - assert "latitude" in result.coords - assert "longitude" in result.coords - finally: - result.close() - - -def test_open_dataset_raises_on_http_error() -> None: - with patch("climate_api.client.httpx.get") as mock_get: - mock_get.return_value.raise_for_status.side_effect = httpx.HTTPStatusError( - "404", request=MagicMock(), response=MagicMock() - ) - with pytest.raises(httpx.HTTPStatusError): - open_dataset("nonexistent", base_url="http://localhost") - - -def test_open_dataset_uses_default_base_url() -> None: - collection = _make_collection("/dev/null") - with patch("climate_api.client.httpx.get", return_value=_make_response(collection)) as mock_get: - with 
patch("climate_api.client.xr.open_zarr", return_value=MagicMock()): - open_dataset("any_dataset") - - mock_get.assert_called_once_with("http://127.0.0.1:8000/stac/collections/any_dataset", timeout=30) - - -def test_open_dataset_uses_env_var_base_url(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv("CLIMATE_API_BASE_URL", "http://env-host:9000") - collection = _make_collection("/dev/null") - with patch("climate_api.client.httpx.get", return_value=_make_response(collection)) as mock_get: - with patch("climate_api.client.xr.open_zarr", return_value=MagicMock()): - open_dataset("any_dataset") - - mock_get.assert_called_once_with("http://env-host:9000/stac/collections/any_dataset", timeout=30) - - # ── Client class ─────────────────────────────────────────────────────────────── diff --git a/tests/test_config.py b/tests/test_config.py index 6af76acc..e017ae84 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -206,7 +206,7 @@ def test_plugins_dir_adds_root_to_sys_path_and_makes_modules_importable( sync: kind: static ingestion: - function: myplugin.source.download + plugin: myplugin.source.MyPlugin """, encoding="utf-8", ) @@ -240,7 +240,7 @@ def test_plugins_dir_in_config_adds_to_bundled(monkeypatch: pytest.MonkeyPatch, sync: kind: static ingestion: - function: mypackage.sources.download + plugin: mypackage.sources.MyPlugin """, encoding="utf-8", ) @@ -273,7 +273,7 @@ def test_plugins_dir_resolved_relative_to_config_file(monkeypatch: pytest.Monkey sync: kind: static ingestion: - function: mypackage.sources.download + plugin: mypackage.sources.MyPlugin """, encoding="utf-8", ) @@ -299,7 +299,7 @@ def test_plugins_dir_in_config_overrides_bundled_by_id(monkeypatch: pytest.Monke sync: kind: static ingestion: - function: mypackage.sources.download + plugin: mypackage.sources.MyPlugin """, encoding="utf-8", ) diff --git a/tests/test_dataset_registry.py b/tests/test_dataset_registry.py index 9d44d290..d9757dc4 100644 --- 
a/tests/test_dataset_registry.py +++ b/tests/test_dataset_registry.py @@ -58,7 +58,7 @@ def test_dataset_registry_accepts_supported_sync_kind( sync: kind: temporal ingestion: - function: some.download.function + plugin: some.ingest.Plugin """, encoding="utf-8", ) @@ -129,63 +129,10 @@ def test_dataset_registry_accepts_supported_sync_execution( kind: temporal execution: append ingestion: - function: some.download.function + plugin: some.ingest.Plugin """, encoding="utf-8", ) monkeypatch.setattr(datasets, "CONFIGS_DIR", tmp_path) assert datasets.list_datasets()[0]["sync"]["execution"] == "append" - - -def test_dataset_registry_rejects_invalid_sync_availability_function( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - registry_file = tmp_path / "invalid_sync_availability.yaml" - registry_file.write_text( - """ -- id: invalid_sync_availability - name: Invalid sync availability - variable: value - period_type: daily - sync: - kind: temporal - availability: - latest_available_function: 42 - ingestion: - function: some.download.function -""", - encoding="utf-8", - ) - monkeypatch.setattr(datasets, "CONFIGS_DIR", tmp_path) - - with pytest.raises(ValueError, match="invalid sync.availability.latest_available_function"): - datasets.list_datasets() - - -def test_dataset_registry_accepts_sync_availability_function( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - registry_file = tmp_path / "valid_sync_availability.yaml" - registry_file.write_text( - """ -- id: valid_sync_availability - name: Valid sync availability - variable: value - period_type: daily - sync: - kind: temporal - availability: - latest_available_function: climate_api.providers.availability.lagged_latest_available - ingestion: - function: some.download.function -""", - encoding="utf-8", - ) - monkeypatch.setattr(datasets, "CONFIGS_DIR", tmp_path) - - assert datasets.list_datasets()[0]["sync"]["availability"]["latest_available_function"].endswith( - "lagged_latest_available" - 
) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 7ece9ee2..f952b8ca 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,4 +1,4 @@ -from datetime import UTC, datetime, tzinfo +from datetime import UTC, datetime from pathlib import Path import pytest @@ -38,7 +38,6 @@ def _artifact( dataset_name="CHIRPS3 precipitation", variable="precip", format=ArtifactFormat.ZARR, - path="/tmp/chirps3_precipitation_daily.zarr", asset_paths=["/tmp/chirps3_precipitation_daily.zarr"], variables=["precip"], request_scope=ArtifactRequestScope( @@ -125,17 +124,22 @@ def test_dataset_links_include_stac_for_published_zarr() -> None: assert any(link.rel == "stac" and link.href == "/stac/collections/chirps3_precipitation_daily" for link in links) -def test_dataset_links_omit_stac_for_unpublished_or_netcdf() -> None: +def test_dataset_links_include_stac_for_published_icechunk() -> None: + artifact = _artifact(artifact_id="a1") + artifact = artifact.model_copy(update={"format": ArtifactFormat.ICECHUNK}) + + links = services._dataset_links("chirps3_precipitation_daily", artifact) + + assert any(link.rel == "stac" and link.href == "/stac/collections/chirps3_precipitation_daily" for link in links) + + +def test_dataset_links_omit_stac_for_unpublished() -> None: unpublished = _artifact(artifact_id="a1") unpublished.publication.status = PublicationStatus.UNPUBLISHED - netcdf = _artifact(artifact_id="a2") - netcdf.format = ArtifactFormat.NETCDF unpublished_links = services._dataset_links("chirps3_precipitation_daily", unpublished) - netcdf_links = services._dataset_links("chirps3_precipitation_daily", netcdf) assert all(link.rel != "stac" for link in unpublished_links) - assert all(link.rel != "stac" for link in netcdf_links) def test_list_ingestions_returns_most_recent_first(monkeypatch: pytest.MonkeyPatch) -> None: @@ -204,7 +208,6 @@ def test_find_existing_artifact_ignores_record_with_overwide_coverage() -> None: records=[stale_artifact, valid_artifact], 
dataset_id="chirps3_precipitation_daily", request_scope=request_scope, - prefer_zarr=True, ) assert result == valid_artifact @@ -228,7 +231,6 @@ def test_find_existing_artifact_ignores_stale_record(monkeypatch: pytest.MonkeyP records=[stale_artifact, valid_artifact], dataset_id="chirps3_precipitation_daily", request_scope=request_scope, - prefer_zarr=True, ) assert result == valid_artifact @@ -278,301 +280,6 @@ def test_temporal_coverage_matches_request_scope_allows_open_ended_reuse() -> No ) -def test_create_artifact_computes_coverage_from_created_artifact_paths( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path -) -> None: - dataset: dict[str, object] = { - "id": "worldpop_population_yearly", - "name": "Total population (WorldPop Global12)", - "variable": "pop_total", - "period_type": "yearly", - } - created_file = tmp_path / "worldpop_population_yearly_2020.nc" - created_file.write_text("dummy", encoding="utf-8") - - monkeypatch.setattr(services.downloader, "download_dataset", lambda *_, **__: [created_file]) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: None) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - - captured: dict[str, object] = {} - - def fake_get_data_coverage_for_paths( - dataset_arg: dict[str, object], - *, - zarr_path: str | None = None, - netcdf_paths: list[str] | None = None, - ) -> dict[str, object]: - captured["dataset_id"] = dataset_arg["id"] - captured["zarr_path"] = zarr_path - captured["netcdf_paths"] = netcdf_paths - return { - "coverage": { - "temporal": {"start": "2020", "end": "2020"}, - "spatial": {"xmin": -13.3, "ymin": 6.9, "xmax": -10.2, "ymax": 10.0}, - } - } - - monkeypatch.setattr(services, "get_data_coverage_for_paths", fake_get_data_coverage_for_paths) - monkeypatch.setattr(services, "_store_artifact_record", lambda record, **_: record) - - artifact = services.create_artifact( - dataset=dataset, - start="2020", - end="2020", - bbox=[-13.5, 6.9, -10.1, 10.0], - 
country_code="SLE", - overwrite=False, - prefer_zarr=False, - publish=False, - ) - - assert captured["dataset_id"] == "worldpop_population_yearly" - assert captured["zarr_path"] is None - assert captured["netcdf_paths"] == [str(created_file.resolve())] - assert artifact.coverage.temporal.start == "2020" - assert artifact.coverage.temporal.end == "2020" - - -def test_create_artifact_normalizes_request_scope_to_dataset_period( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - dataset: dict[str, object] = { - "id": "era5land_temperature_hourly", - "name": "2m temperature (ERA5-Land)", - "variable": "t2m", - "period_type": "hourly", - } - created_file = tmp_path / "era5land_temperature_hourly_2026-04-21.nc" - created_file.write_text("dummy", encoding="utf-8") - - captured_download: dict[str, object] = {} - - def fake_download_dataset( - dataset_arg: dict[str, object], - *, - start: str, - end: str | None, - **_: object, - ) -> list[Path]: - captured_download["dataset_id"] = dataset_arg["id"] - captured_download["start"] = start - captured_download["end"] = end - return [created_file] - - monkeypatch.setattr(services.downloader, "download_dataset", fake_download_dataset) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: None) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - monkeypatch.setattr( - services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "coverage": { - "temporal": {"start": "2026-04-21T12", "end": "2026-04-21T13"}, - "spatial": {"xmin": 1.0, "ymin": 2.0, "xmax": 3.0, "ymax": 4.0}, - } - }, - ) - monkeypatch.setattr(services, "_store_artifact_record", lambda record, **_: record) - - artifact = services.create_artifact( - dataset=dataset, - start="2026-04-21T12:15:00", - end="2026-04-21T13:45:00", - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=False, - publish=False, - ) - - assert captured_download == { - "dataset_id": 
"era5land_temperature_hourly", - "start": "2026-04-21T12", - "end": "2026-04-21T13", - } - assert artifact.request_scope.start == "2026-04-21T12" - assert artifact.request_scope.end == "2026-04-21T13" - - -def test_create_artifact_defaults_omitted_end_to_dataset_native_period_for_download( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - dataset: dict[str, object] = { - "id": "era5land_temperature_hourly", - "name": "2m temperature (ERA5-Land)", - "variable": "t2m", - "period_type": "hourly", - } - created_file = tmp_path / "era5land_temperature_hourly_2026-04-21.nc" - created_file.write_text("dummy", encoding="utf-8") - - captured_download: dict[str, object] = {} - - class FixedDateTime(datetime): - @classmethod - def now(cls, tz: tzinfo | None = None) -> "FixedDateTime": - return cls(2026, 4, 21, 13, 47, 31, tzinfo=tz if tz is UTC else None) - - def fake_download_dataset( - dataset_arg: dict[str, object], - *, - start: str, - end: str | None, - **_: object, - ) -> list[Path]: - captured_download["dataset_id"] = dataset_arg["id"] - captured_download["start"] = start - captured_download["end"] = end - return [created_file] - - monkeypatch.setattr(services, "utc_now", lambda: FixedDateTime(2026, 4, 21, 13, 47, 31, tzinfo=UTC)) - monkeypatch.setattr(services.downloader, "download_dataset", fake_download_dataset) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: None) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - monkeypatch.setattr( - services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "coverage": { - "temporal": {"start": "2026-04-21T12", "end": "2026-04-21T13"}, - "spatial": {"xmin": 1.0, "ymin": 2.0, "xmax": 3.0, "ymax": 4.0}, - } - }, - ) - monkeypatch.setattr(services, "_store_artifact_record", lambda record, **_: record) - - artifact = services.create_artifact( - dataset=dataset, - start="2026-04-21T12:15:00", - end=None, - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - 
overwrite=False, - prefer_zarr=False, - publish=False, - ) - - assert captured_download == { - "dataset_id": "era5land_temperature_hourly", - "start": "2026-04-21T12", - "end": "2026-04-21T13", - } - assert artifact.request_scope.end is None - - -def test_create_artifact_returns_409_when_downloaded_artifact_has_no_data( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path -) -> None: - dataset: dict[str, object] = { - "id": "worldpop_population_yearly", - "name": "Total population (WorldPop Global12)", - "variable": "pop_total", - "period_type": "yearly", - } - created_file = tmp_path / "worldpop_population_yearly_2020.nc" - created_file.write_text("dummy", encoding="utf-8") - - monkeypatch.setattr(services.downloader, "download_dataset", lambda *_, **__: [created_file]) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: None) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - monkeypatch.setattr( - services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "has_data": False, - "coverage": { - "temporal": {"start": None, "end": None}, - "spatial": {"xmin": None, "ymin": None, "xmax": None, "ymax": None}, - }, - }, - ) - - with pytest.raises(services.HTTPException) as exc_info: - services.create_artifact( - dataset=dataset, - start="2020", - end="2020", - bbox=[-13.5, 6.9, -10.1, 10.0], - country_code="SLE", - overwrite=False, - prefer_zarr=False, - publish=False, - ) - - assert exc_info.value.status_code == 409 - assert exc_info.value.detail == "Downloaded artifact contains no data for the requested scope" - - -def test_create_artifact_can_download_delta_while_recording_full_request_scope( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - dataset: dict[str, object] = { - "id": "chirps3_precipitation_daily", - "name": "Total precipitation (CHIRPS3)", - "variable": "precip", - "period_type": "daily", - } - created_file = tmp_path / "chirps3_precipitation_daily_2026-02-01_2026-02-10.nc" - 
created_file.write_text("dummy", encoding="utf-8") - - captured_download: dict[str, object] = {} - - def fake_download_dataset( - dataset_arg: dict[str, object], - *, - start: str, - end: str | None, - **_: object, - ) -> list[Path]: - captured_download["dataset_id"] = dataset_arg["id"] - captured_download["start"] = start - captured_download["end"] = end - return [created_file] - - monkeypatch.setattr(services.downloader, "download_dataset", fake_download_dataset) - monkeypatch.setattr(services.downloader, "build_dataset_zarr", lambda *_, **__: None) - zarr_path_chirps = tmp_path / "chirps3_precipitation_daily.zarr" - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: zarr_path_chirps) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - monkeypatch.setattr( - services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "coverage": { - "temporal": {"start": "2026-01-01", "end": "2026-02-10"}, - "spatial": {"xmin": 1.0, "ymin": 2.0, "xmax": 3.0, "ymax": 4.0}, - } - }, - ) - monkeypatch.setattr(services, "_store_artifact_record", lambda record, **_: record) - - artifact = services.create_artifact( - dataset=dataset, - start="2026-01-01", - end="2026-02-10", - download_start="2026-02-01", - download_end="2026-02-10", - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=True, - publish=False, - ) - - assert captured_download == { - "dataset_id": "chirps3_precipitation_daily", - "start": "2026-02-01", - "end": "2026-02-10", - } - assert artifact.request_scope.start == "2026-01-01" - assert artifact.request_scope.end == "2026-02-10" - assert artifact.coverage.temporal.start == "2026-01-01" - assert artifact.coverage.temporal.end == "2026-02-10" - - def test_create_artifact_rejects_partial_download_scope(monkeypatch: pytest.MonkeyPatch) -> None: dataset: dict[str, object] = { "id": "chirps3_precipitation_daily", @@ -589,9 +296,7 @@ def 
test_create_artifact_rejects_partial_download_scope(monkeypatch: pytest.Monk download_start=None, download_end="2026-02-10", bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, overwrite=False, - prefer_zarr=True, publish=False, ) @@ -617,9 +322,7 @@ def test_create_artifact_rejects_download_scope_outside_request_scope(monkeypatc download_start="2026-02-01", download_end="2026-02-11", bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, overwrite=False, - prefer_zarr=True, publish=False, ) @@ -627,211 +330,85 @@ def test_create_artifact_rejects_download_scope_outside_request_scope(monkeypatc assert "download_end must be less than or equal to end" in str(exc_info.value.detail) -def test_create_artifact_delta_does_not_reuse_netcdf_artifact_when_canonical_zarr_is_required( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path -) -> None: - dataset: dict[str, object] = { - "id": "chirps3_precipitation_daily", - "name": "Total precipitation (CHIRPS3)", - "variable": "precip", - "period_type": "daily", - } - created_file = tmp_path / "chirps3_precipitation_daily_2026-02-01_2026-02-10.nc" - created_file.write_text("dummy", encoding="utf-8") - zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" - - netcdf_existing = _artifact(artifact_id="existing", end="2026-02-10") - netcdf_existing.format = ArtifactFormat.NETCDF - netcdf_existing.path = str(created_file) - netcdf_existing.asset_paths = [str(created_file)] - netcdf_existing.request_scope = ArtifactRequestScope( - start="2026-01-01", - end="2026-02-10", - bbox=(1.0, 2.0, 3.0, 4.0), - ) - - lookup_preferences: list[bool] = [] - - def fake_find_existing_artifact(**kwargs: object) -> ArtifactRecord | None: - lookup_preferences.append(bool(kwargs["prefer_zarr"])) - return None if kwargs["prefer_zarr"] else netcdf_existing - - monkeypatch.setattr(services, "_find_existing_artifact", fake_find_existing_artifact) - monkeypatch.setattr(services.downloader, "download_dataset", lambda *_, **__: [created_file]) - 
monkeypatch.setattr(services.downloader, "build_dataset_zarr", lambda *_, **__: None) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: zarr_path) - monkeypatch.setattr(services.downloader, "get_cache_files", lambda _: [created_file]) - monkeypatch.setattr( - services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "coverage": { - "temporal": {"start": "2026-01-01", "end": "2026-02-10"}, - "spatial": {"xmin": 1.0, "ymin": 2.0, "xmax": 3.0, "ymax": 4.0}, - } - }, - ) - monkeypatch.setattr(services, "_store_artifact_record", lambda record, **_: record) - - artifact = services.create_artifact( - dataset=dataset, - start="2026-01-01", - end="2026-02-10", - download_start="2026-02-01", - download_end="2026-02-10", - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=False, - publish=False, +def _icechunk_artifact( + *, + artifact_id: str = "ic1", + path: str = "/tmp/test.icechunk", +) -> ArtifactRecord: + return ArtifactRecord( + artifact_id=artifact_id, + dataset_id="chirps3_precipitation_daily", + dataset_name="CHIRPS3 precipitation", + variable="precip", + format=ArtifactFormat.ICECHUNK, + asset_paths=[path], + variables=["precip"], + request_scope=ArtifactRequestScope( + start="2026-01-01", + end="2026-01-10", + bbox=(1.0, 2.0, 3.0, 4.0), + ), + coverage=ArtifactCoverage( + temporal=CoverageTemporal(start="2026-01-01", end="2026-01-10"), + spatial=CoverageSpatial(xmin=1.0, ymin=2.0, xmax=3.0, ymax=4.0), + ), + created_at=datetime(2026, 1, 10, tzinfo=UTC), + publication=ArtifactPublication(status=PublicationStatus.PUBLISHED), ) - assert lookup_preferences == [True] - assert artifact.format == ArtifactFormat.ZARR - -def test_create_artifact_delta_requires_canonical_zarr_when_prefer_zarr_is_false( +def test_get_zarr_store_info_dispatches_to_icechunk_for_icechunk_artifact( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, ) -> None: - dataset: dict[str, object] = { - "id": "chirps3_precipitation_daily", - "name": 
"Total precipitation (CHIRPS3)", - "variable": "precip", - "period_type": "daily", - } - created_file = tmp_path / "chirps3_precipitation_daily_2026-02-01_2026-02-10.nc" - created_file.write_text("dummy", encoding="utf-8") - zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" - - captured_build: dict[str, object] = {} - - def fake_build_dataset_zarr(dataset_arg: dict[str, object], *, start: str | None, end: str | None) -> None: - captured_build["dataset_id"] = dataset_arg["id"] - captured_build["start"] = start - captured_build["end"] = end - - monkeypatch.setattr(services.downloader, "download_dataset", lambda *_, **__: [created_file]) - monkeypatch.setattr(services.downloader, "build_dataset_zarr", fake_build_dataset_zarr) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: zarr_path) - monkeypatch.setattr(services.downloader, "get_cache_files", lambda _: [created_file]) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) + store_path = tmp_path / "test.icechunk" + store_path.mkdir() + artifact = _icechunk_artifact(path=str(store_path)) + monkeypatch.setattr( services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "coverage": { - "temporal": {"start": "2026-01-01", "end": "2026-02-10"}, - "spatial": {"xmin": 1.0, "ymin": 2.0, "xmax": 3.0, "ymax": 4.0}, - } - }, + "get_latest_artifact_for_dataset_or_404", + lambda _: artifact, ) - monkeypatch.setattr(services, "_store_artifact_record", lambda record, **_: record) - - artifact = services.create_artifact( - dataset=dataset, - start="2026-01-01", - end="2026-02-10", - download_start="2026-02-01", - download_end="2026-02-10", - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=False, - publish=False, - ) - - assert captured_build == { - "dataset_id": "chirps3_precipitation_daily", - "start": "2026-01-01", - "end": "2026-02-10", - } - assert artifact.format == ArtifactFormat.ZARR + called_with: list[str] = [] -def 
test_create_artifact_delta_fails_when_canonical_zarr_build_fails( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -) -> None: - dataset: dict[str, object] = { - "id": "chirps3_precipitation_daily", - "name": "Total precipitation (CHIRPS3)", - "variable": "precip", - "period_type": "daily", - } - created_file = tmp_path / "chirps3_precipitation_daily_2026-02-01_2026-02-10.nc" - created_file.write_text("dummy", encoding="utf-8") + def fake_icechunk_store_info(dataset_id: str, art: ArtifactRecord) -> dict: + called_with.append(dataset_id) + return {"kind": "ZarrListing", "dataset_id": dataset_id, "format": art.format, "entries": []} - def fail_build_dataset_zarr(*_: object, **__: object) -> None: - raise ValueError("zarr failed") + monkeypatch.setattr(services, "_icechunk_store_info", fake_icechunk_store_info) - monkeypatch.setattr(services.downloader, "download_dataset", lambda *_, **__: [created_file]) - monkeypatch.setattr(services.downloader, "build_dataset_zarr", fail_build_dataset_zarr) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: None) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) - - with pytest.raises(services.HTTPException) as exc_info: - services.create_artifact( - dataset=dataset, - start="2026-01-01", - end="2026-02-10", - download_start="2026-02-01", - download_end="2026-02-10", - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=True, - publish=False, - ) + result = services.get_dataset_zarr_store_info_or_404("chirps3_precipitation_daily") - assert exc_info.value.status_code == 409 - assert "Append sync canonical Zarr rebuild failed for requested scope: zarr failed" in str(exc_info.value.detail) + assert result["kind"] == "ZarrListing" + assert called_with == ["chirps3_precipitation_daily"] -def test_create_artifact_delta_rejects_short_rebuilt_coverage( +def test_get_zarr_store_file_dispatches_to_icechunk_for_icechunk_artifact( monkeypatch: pytest.MonkeyPatch, 
tmp_path: Path, ) -> None: - dataset: dict[str, object] = { - "id": "chirps3_precipitation_daily", - "name": "Total precipitation (CHIRPS3)", - "variable": "precip", - "period_type": "daily", - } - created_file = tmp_path / "chirps3_precipitation_daily_2026-02-01_2026-02-10.nc" - created_file.write_text("dummy", encoding="utf-8") - zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" - - monkeypatch.setattr(services.downloader, "download_dataset", lambda *_, **__: [created_file]) - monkeypatch.setattr(services.downloader, "build_dataset_zarr", lambda *_, **__: None) - monkeypatch.setattr(services.downloader, "get_zarr_path", lambda _: zarr_path) - monkeypatch.setattr(services.downloader, "get_cache_files", lambda _: [created_file]) - monkeypatch.setattr(services, "_find_existing_artifact", lambda **_: None) + store_path = tmp_path / "test.icechunk" + store_path.mkdir() + artifact = _icechunk_artifact(path=str(store_path)) + monkeypatch.setattr( services, - "get_data_coverage_for_paths", - lambda *_, **__: { - "coverage": { - "temporal": {"start": "2026-02-01", "end": "2026-02-10"}, - "spatial": {"xmin": 1.0, "ymin": 2.0, "xmax": 3.0, "ymax": 4.0}, - } - }, + "get_latest_artifact_for_dataset_or_404", + lambda _: artifact, ) - with pytest.raises(services.HTTPException) as exc_info: - services.create_artifact( - dataset=dataset, - start="2026-01-01", - end="2026-02-10", - download_start="2026-02-01", - download_end="2026-02-10", - bbox=[1.0, 2.0, 3.0, 4.0], - country_code=None, - overwrite=False, - prefer_zarr=True, - publish=False, - ) + served_keys: list[str] = [] + + from starlette.responses import Response + + def fake_serve_icechunk_key(dataset_id: str, art: ArtifactRecord, relative_path: str) -> Response: + served_keys.append(relative_path) + return Response(content=b'{"zarr_format": 3}', media_type="application/json") + + monkeypatch.setattr(services, "_serve_icechunk_key", fake_serve_icechunk_key) + + 
services.get_dataset_zarr_store_file_or_404("chirps3_precipitation_daily", "t2m/zarr.json") - assert exc_info.value.status_code == 409 - assert "coverage=2026-02-01..2026-02-10" in str(exc_info.value.detail) - assert "request=2026-01-01..2026-02-10" in str(exc_info.value.detail) + assert served_keys == ["t2m/zarr.json"] diff --git a/tests/test_datasets_sync.py b/tests/test_datasets_sync.py index cbd7cd13..90b13537 100644 --- a/tests/test_datasets_sync.py +++ b/tests/test_datasets_sync.py @@ -38,7 +38,6 @@ def _artifact( dataset_name="CHIRPS3 precipitation", variable="precip", format=ArtifactFormat.ZARR, - path=path, asset_paths=[path], variables=["precip"], request_scope=ArtifactRequestScope( @@ -104,7 +103,7 @@ def test_sync_dataset_returns_up_to_date_when_no_new_period_is_due(monkeypatch: ) monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) - result = services.sync_dataset(dataset_id=dataset_id, end="2026-01-31", prefer_zarr=True, publish=True) + result = services.sync_dataset(dataset_id=dataset_id, end="2026-01-31", publish=True) assert result.sync_id is None assert result.status == "up_to_date" @@ -137,12 +136,11 @@ def fake_create_artifact(**kwargs: object) -> ArtifactRecord: monkeypatch.setattr(services, "create_artifact", fake_create_artifact) monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) - result = services.sync_dataset(dataset_id=dataset_id, end="2026-02-10", prefer_zarr=True, publish=True) + result = services.sync_dataset(dataset_id=dataset_id, end="2026-02-10", publish=True) assert captured["start"] == "2026-01-01" assert captured["end"] == "2026-02-10" assert captured["bbox"] == [1.0, 2.0, 3.0, 4.0] - assert captured["country_code"] == "SLE" assert result.sync_id == "a2" assert result.status == "completed" assert result.message == "Managed dataset was rematerialized against the latest planned upstream state." 
@@ -187,7 +185,7 @@ def fake_create_artifact(**kwargs: object) -> ArtifactRecord: monkeypatch.setattr(services, "create_artifact", fake_create_artifact) monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) - result = services.sync_dataset(dataset_id=dataset_id, end="2026-02-10", prefer_zarr=True, publish=True) + result = services.sync_dataset(dataset_id=dataset_id, end="2026-02-10", publish=True) assert captured["start"] == "2026-01-01" assert captured["end"] == "2026-02-10" @@ -255,7 +253,7 @@ def fake_warning(message: str, *args: object) -> None: monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) monkeypatch.setattr(sync_engine.logger, "warning", fake_warning) - result = services.sync_dataset(dataset_id=dataset_id, end="2025", prefer_zarr=True, publish=True) + result = services.sync_dataset(dataset_id=dataset_id, end="2025", publish=True) assert "download_start" in captured assert captured["download_start"] is None @@ -279,7 +277,7 @@ def test_sync_dataset_release_policy_returns_up_to_date_when_release_matches(mon ) monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) - result = services.sync_dataset(dataset_id=dataset_id, end="2024", prefer_zarr=True, publish=True) + result = services.sync_dataset(dataset_id=dataset_id, end="2024", publish=True) assert result.sync_id is None assert result.status == "up_to_date" @@ -288,59 +286,6 @@ def test_sync_dataset_release_policy_returns_up_to_date_when_release_matches(mon assert result.sync_detail.reason == "no_new_release" -def test_sync_dataset_release_policy_clamps_future_year_by_template_availability( - monkeypatch: pytest.MonkeyPatch, -) -> None: - dataset_id = "release_dataset_sle" - latest = _artifact( - artifact_id="a1", - source_dataset_id="release_dataset_yearly", - managed_dataset_id=dataset_id, - end="2024", - ) - monkeypatch.setattr(services, "get_latest_artifact_for_dataset_or_404", lambda _: 
latest) - monkeypatch.setattr( - sync_engine.provider_availability, - "utc_today", - lambda: date(2026, 4, 15), - ) - monkeypatch.setattr( - services.registry_datasets, - "get_dataset", - lambda _: { - "id": "release_dataset_yearly", - "period_type": "yearly", - "sync": {"kind": "release", "availability": {"latest_year_offset": 1}}, - }, - ) - - captured: dict[str, object] = {} - - def fake_create_artifact(**kwargs: object) -> ArtifactRecord: - captured.update(kwargs) - return _artifact( - artifact_id="a2", - source_dataset_id="release_dataset_yearly", - managed_dataset_id=dataset_id, - end="2025", - ) - - monkeypatch.setattr(services, "create_artifact", fake_create_artifact) - monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) - - result = services.sync_dataset(dataset_id=dataset_id, end="2026", prefer_zarr=True, publish=True) - - assert captured["start"] == "2026-01-01" - assert captured["end"] == "2025" - assert result.status == "completed" - assert result.sync_detail.sync_kind == SyncKind.RELEASE - assert result.sync_detail.action == SyncAction.REMATERIALIZE - assert result.sync_detail.target_end == "2025" - assert result.sync_detail.target_end_source == "request_clamped_by_availability" - assert result.sync_detail.delta_start is None - assert result.sync_detail.delta_end is None - - def test_default_hourly_target_end_is_utc_aware(monkeypatch: pytest.MonkeyPatch) -> None: class FixedDateTime(datetime): @classmethod @@ -404,7 +349,7 @@ def test_sync_dataset_static_policy_returns_not_syncable_without_period_arithmet monkeypatch.setattr(services, "create_artifact", lambda **_: pytest.fail("static sync should not create artifacts")) monkeypatch.setattr(services, "get_dataset_or_404", lambda _: _dataset_detail(dataset_id)) - result = services.sync_dataset(dataset_id=dataset_id, end="ignored", prefer_zarr=True, publish=True) + result = services.sync_dataset(dataset_id=dataset_id, end="ignored", publish=True) assert 
result.sync_id is None assert result.status == "not_syncable" @@ -561,7 +506,7 @@ def test_sync_route_executes_rematerialize_and_returns_structured_detail( response = client.post( f"/sync/{dataset_id}", - json={"end": "2026-02-10", "prefer_zarr": True, "publish": True}, + json={"end": "2026-02-10", "publish": True}, ) assert response.status_code == 200 @@ -574,26 +519,6 @@ def test_sync_route_executes_rematerialize_and_returns_structured_detail( assert payload["sync_detail"]["target_end"] == "2026-02-10" -def test_latest_available_end_preserves_requested_month_without_lag(monkeypatch: pytest.MonkeyPatch) -> None: - class FixedDate(date): - @classmethod - def today(cls) -> "FixedDate": - return cls(2026, 4, 15) - - monkeypatch.setattr(sync_engine.provider_availability, "utc_today", lambda: FixedDate(2026, 4, 15)) - - result = sync_engine._latest_available_end( - source_dataset={ - "id": "monthly_dataset", - "period_type": "monthly", - "sync": {"availability": {"lag_days": 0}}, - }, - requested_end="2026-05", - ) - - assert result == "2026-05" - - def test_plan_sync_marks_default_target_end_source(monkeypatch: pytest.MonkeyPatch) -> None: class FixedDate(date): @classmethod @@ -619,162 +544,201 @@ def today(cls) -> "FixedDate": assert result.delta_end == "2026-04-20" -def test_plan_sync_marks_request_target_clamped_by_availability(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(sync_engine, "_get_dynamic_function", lambda _: lambda: "2026-03-31") +def test_latest_available_end_uses_plugin_periods_for_plugin_datasets( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """For plugin datasets, _latest_available_end calls plugin.periods() instead of + latest_available_function.""" + monkeypatch.setattr( + sync_engine, + "_plugin_latest_available_period", + lambda *, source_dataset, next_period_start, requested_end, current_end: "2026-02-08", + ) - result = sync_engine.plan_sync( + result = sync_engine._latest_available_end( source_dataset={ - "id": 
"chirps3_precipitation_daily", + "id": "era5land_temperature_hourly", "period_type": "daily", - "sync": { - "kind": "temporal", - "execution": "append", - "availability": { - "latest_available_function": "climate_api.providers.availability.chirps3_daily_latest_available" - }, - }, - "ingestion": {}, + "sync": {"kind": "temporal"}, + "ingestion": {"plugin": "some.Plugin", "params": {}}, }, - latest_artifact=_artifact(artifact_id="a1", end="2026-02-28"), - requested_end="2026-04-21", + requested_end="2026-02-10", + current_end="2026-02-06", ) - assert result.target_end == "2026-03-31" - assert result.target_end_source == "request_clamped_by_availability" - assert result.delta_start == "2026-03-01" - assert result.delta_end == "2026-03-31" + assert result == "2026-02-08" -def test_latest_available_end_clamps_monthly_lag_to_month_period(monkeypatch: pytest.MonkeyPatch) -> None: - class FixedDate(date): - @classmethod - def today(cls) -> "FixedDate": - return cls(2026, 4, 15) - - monkeypatch.setattr(sync_engine.provider_availability, "utc_today", lambda: FixedDate(2026, 4, 15)) +def test_latest_available_end_plugin_returns_current_end_when_no_new_periods( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When plugin.periods() returns an empty list, _latest_available_end returns current_end + so the NOOP check fires correctly.""" + monkeypatch.setattr( + sync_engine, + "_plugin_latest_available_period", + lambda *, source_dataset, next_period_start, requested_end, current_end: current_end, + ) result = sync_engine._latest_available_end( source_dataset={ - "id": "monthly_dataset", - "period_type": "monthly", - "sync": {"availability": {"lag_days": 1}}, + "id": "era5land_temperature_hourly", + "period_type": "daily", + "sync": {"kind": "temporal"}, + "ingestion": {"plugin": "some.Plugin", "params": {}}, }, - requested_end="2026-05", + requested_end="2026-02-10", + current_end="2026-02-06", ) - assert result == "2026-04" + assert result == "2026-02-06" -def 
test_latest_available_end_uses_provider_availability_hook(monkeypatch: pytest.MonkeyPatch) -> None: - calls: list[dict[str, object]] = [] +def test_plugin_latest_available_period_returns_last_period() -> None: + """_plugin_latest_available_period returns the last item from plugin.periods().""" - def fake_latest_available(*, dataset: dict[str, object], requested_end: str) -> str: - calls.append({"dataset": dataset, "requested_end": requested_end}) - return "2026-02-05" + class FakePlugin: + max_concurrency = 1 + commit_batch_size = 1 + rechunk_time = None - monkeypatch.setattr(sync_engine, "_get_dynamic_function", lambda _: fake_latest_available) + def probe(self, *_a: object, **_k: object) -> object: # type: ignore[override] + raise NotImplementedError - source_dataset = { - "id": "provider_dataset", - "period_type": "daily", - "sync": {"availability": {"latest_available_function": "provider.latest_available"}}, - } - result = sync_engine._latest_available_end(source_dataset=source_dataset, requested_end="2026-02-10") + def periods(self, start: str, end: str) -> list[str]: + return [d for d in ["2026-02-07", "2026-02-08", "2026-02-09"] if start <= d <= end] - assert result == "2026-02-05" - assert calls == [{"dataset": source_dataset, "requested_end": "2026-02-10"}] + def fetch_period(self, *_a: object, **_k: object) -> object: # type: ignore[override] + raise NotImplementedError + import climate_api.ingest.orchestrator as orch_mod -def test_latest_available_end_clamps_provider_availability_to_requested_end(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(sync_engine, "_get_dynamic_function", lambda _: lambda: "2026-03-01") + orig = orch_mod.load_plugin + orch_mod.load_plugin = lambda path, params, extra_params=None: FakePlugin() # type: ignore[assignment] + try: + result = sync_engine._plugin_latest_available_period( + source_dataset={"ingestion": {"plugin": "fake.Plugin", "params": {}}}, + next_period_start="2026-02-07", + 
requested_end="2026-02-10", + current_end="2026-02-06", + ) + finally: + orch_mod.load_plugin = orig - result = sync_engine._latest_available_end( - source_dataset={ - "id": "provider_dataset", - "period_type": "daily", - "sync": {"availability": {"latest_available_function": "provider.latest_available"}}, - }, - requested_end="2026-02-10", - ) + assert result == "2026-02-09" - assert result == "2026-02-10" +def test_plugin_latest_available_period_returns_current_end_when_empty() -> None: + """When plugin.periods() returns [], _plugin_latest_available_period returns current_end.""" -def test_latest_available_end_wraps_provider_import_errors(monkeypatch: pytest.MonkeyPatch) -> None: - def fail_import(_: str) -> object: - raise ImportError("missing provider") + class EmptyPlugin: + max_concurrency = 1 + commit_batch_size = 1 + rechunk_time = None - monkeypatch.setattr(sync_engine, "_get_dynamic_function", fail_import) + def probe(self, *_a: object, **_k: object) -> object: # type: ignore[override] + raise NotImplementedError - with pytest.raises( - sync_engine.SyncConfigurationError, - match="Latest availability function 'provider.latest_available' failed", - ): - sync_engine._latest_available_end( - source_dataset={ - "id": "provider_dataset", - "period_type": "daily", - "sync": {"availability": {"latest_available_function": "provider.latest_available"}}, - }, - requested_end="2026-02-10", - ) + def periods(self, start: str, end: str) -> list[str]: + return [] + def fetch_period(self, *_a: object, **_k: object) -> object: # type: ignore[override] + raise NotImplementedError -def test_latest_available_end_rejects_invalid_provider_period_string(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr(sync_engine, "_get_dynamic_function", lambda _: lambda: "2026-31-99") + import climate_api.ingest.orchestrator as orch_mod - with pytest.raises( - sync_engine.SyncConfigurationError, - match="Latest availability function 'provider.latest_available' returned invalid 
period", - ): - sync_engine._latest_available_end( - source_dataset={ - "id": "provider_dataset", - "period_type": "daily", - "sync": {"availability": {"latest_available_function": "provider.latest_available"}}, - }, + orig = orch_mod.load_plugin + orch_mod.load_plugin = lambda *_a, **_kw: EmptyPlugin() # type: ignore[assignment] + try: + result = sync_engine._plugin_latest_available_period( + source_dataset={"ingestion": {"plugin": "fake.Plugin", "params": {}}}, + next_period_start="2026-02-07", requested_end="2026-02-10", + current_end="2026-02-06", ) + finally: + orch_mod.load_plugin = orig + assert result == "2026-02-06" -def test_latest_available_end_wraps_invalid_provider_function_path(monkeypatch: pytest.MonkeyPatch) -> None: - with pytest.raises(sync_engine.SyncConfigurationError, match="Latest availability function 'invalid_path' failed"): - sync_engine._latest_available_end( - source_dataset={ - "id": "provider_dataset", - "period_type": "daily", - "sync": {"availability": {"latest_available_function": "invalid_path"}}, - }, - requested_end="2026-02-10", + +def test_plugin_latest_available_period_returns_none_on_instantiation_failure() -> None: + """TypeError during load_plugin (e.g. 
plugin needs country_code) → returns None.""" + import climate_api.ingest.orchestrator as orch_mod + + orig = orch_mod.load_plugin + + def explode(path: str, params: dict, extra_params: object = None) -> object: + raise TypeError("country_code is required") + + orch_mod.load_plugin = explode # type: ignore[assignment] + try: + result = sync_engine._plugin_latest_available_period( + source_dataset={"ingestion": {"plugin": "worldpop.Plugin", "params": {}}}, + next_period_start="2026", + requested_end="2026", + current_end="2025", ) + finally: + orch_mod.load_plugin = orig + assert result is None -def test_sync_plan_route_returns_500_for_provider_hook_misconfiguration( - client: TestClient, - monkeypatch: pytest.MonkeyPatch, -) -> None: - dataset_id = "chirps3_precipitation_daily_sle" - latest = _artifact(artifact_id="a1", managed_dataset_id=dataset_id, end="2026-01-31") - monkeypatch.setattr(services, "get_latest_artifact_for_dataset_or_404", lambda _: latest) + +def test_plan_sync_uses_plugin_periods_for_availability(monkeypatch: pytest.MonkeyPatch) -> None: + """For an ICECHUNK artifact backed by a plugin, plan_sync calls plugin.periods() to + determine whether new data is available, not a static lag function.""" monkeypatch.setattr( - services.registry_datasets, - "get_dataset", - lambda _: { - "id": "chirps3_precipitation_daily", - "period_type": "daily", - "sync": {"kind": "temporal", "availability": {"latest_available_function": "provider.latest_available"}}, + sync_engine, + "_plugin_latest_available_period", + lambda *, source_dataset, next_period_start, requested_end, current_end: "2024-01-01T05", + ) + + artifact = _icechunk_artifact(artifact_id="a1", end="2024-01-01T03") + result = sync_engine.plan_sync( + source_dataset={ + "id": "era5land_temperature_hourly", + "period_type": "hourly", + "sync": {"kind": "temporal"}, + "ingestion": { + "plugin": "climate_api.ingest.plugins.era5_land.Era5LandPlugin", + "params": {"variable": "t2m"}, + }, }, + 
latest_artifact=artifact, + requested_end="2024-01-01T10", ) - def fail_import(_: str) -> object: - raise ImportError("missing provider") + assert result.action == "append" + assert result.target_end == "2024-01-01T05" + assert result.delta_start == "2024-01-01T04" + assert result.delta_end == "2024-01-01T05" - monkeypatch.setattr(sync_engine, "_get_dynamic_function", fail_import) - response = client.get(f"/sync/{dataset_id}/plan", params={"end": "2026-02-10"}) +def test_plan_sync_noop_when_plugin_reports_no_new_periods(monkeypatch: pytest.MonkeyPatch) -> None: + """plan_sync returns NO_OP when plugin.periods() is empty (nothing new since current_end).""" + monkeypatch.setattr( + sync_engine, + "_plugin_latest_available_period", + lambda *, source_dataset, next_period_start, requested_end, current_end: current_end, + ) - assert response.status_code == 500 - assert "Latest availability function 'provider.latest_available' failed" in response.json()["detail"] + artifact = _icechunk_artifact(artifact_id="a1", end="2024-01-01T03") + result = sync_engine.plan_sync( + source_dataset={ + "id": "era5land_temperature_hourly", + "period_type": "hourly", + "sync": {"kind": "temporal"}, + "ingestion": { + "plugin": "climate_api.ingest.plugins.era5_land.Era5LandPlugin", + "params": {"variable": "t2m"}, + }, + }, + latest_artifact=artifact, + requested_end="2024-01-01T10", + ) + + assert result.action == "no_op" def test_run_sync_raises_clear_error_when_append_invariants_are_missing(monkeypatch: pytest.MonkeyPatch) -> None: @@ -805,33 +769,190 @@ def test_run_sync_raises_clear_error_when_append_invariants_are_missing(monkeypa latest_artifact=latest_artifact, source_dataset={"id": "chirps3_precipitation_daily", "period_type": "daily", "sync": {"kind": "temporal"}}, requested_end="2026-02-11", - country_code=None, - prefer_zarr=True, publish=True, create_artifact_fn=lambda **_: pytest.fail("create_artifact should not be called"), get_dataset_fn=lambda _: pytest.fail("get_dataset 
should not be called"), ) -def test_sync_dataset_forwards_country_code_from_extent(monkeypatch: pytest.MonkeyPatch) -> None: - dataset_id = "worldpop_population_yearly_sle" - latest = _artifact( - artifact_id="a1", - source_dataset_id="worldpop_population_yearly", - managed_dataset_id=dataset_id, - end="2020", +# --------------------------------------------------------------------------- +# Icechunk store-based sync +# --------------------------------------------------------------------------- + + +def _icechunk_artifact( + *, + artifact_id: str, + source_dataset_id: str = "era5land_temperature_hourly", + managed_dataset_id: str = "era5land_temperature_hourly_nor", + end: str = "2024-01-01T03", + path: str = "/tmp/era5land_temperature_hourly.icechunk", +) -> ArtifactRecord: + return ArtifactRecord( + artifact_id=artifact_id, + dataset_id=source_dataset_id, + dataset_name="2m temperature (ERA5-Land)", + variable="t2m", + format=ArtifactFormat.ICECHUNK, + asset_paths=[path], + variables=["t2m"], + request_scope=ArtifactRequestScope( + start="2024-01-01T00", + end=end, + bbox=(4.0, 57.5, 31.5, 71.5), + ), + coverage=ArtifactCoverage( + temporal=CoverageTemporal(start="2024-01-01T00", end=end), + spatial=CoverageSpatial(xmin=4.0, ymin=57.5, xmax=31.5, ymax=71.5), + ), + created_at=datetime.fromisoformat("2024-01-01T04:00:00+00:00"), + publication=ArtifactPublication(status=PublicationStatus.PUBLISHED), ) + + +def test_plan_sync_uses_current_end_override_instead_of_artifact_metadata() -> None: + """current_end parameter takes precedence over latest_artifact.coverage.temporal.end.""" + artifact = _icechunk_artifact(artifact_id="a1", end="2024-01-01T03") + + result = sync_engine.plan_sync( + source_dataset={ + "id": "era5land_temperature_hourly", + "period_type": "hourly", + "sync": {"kind": "temporal"}, + }, + latest_artifact=artifact, + requested_end="2024-01-01T06", + current_end="2024-01-01T05", + ) + + assert result.current_end == "2024-01-01T05" + assert 
result.delta_start == "2024-01-01T06" + assert result.delta_end == "2024-01-01T06" + assert result.target_end == "2024-01-01T06" + + +def test_plan_sync_falls_back_to_artifact_end_when_no_override() -> None: + artifact = _icechunk_artifact(artifact_id="a1", end="2024-01-01T03") + + result = sync_engine.plan_sync( + source_dataset={ + "id": "era5land_temperature_hourly", + "period_type": "hourly", + "sync": {"kind": "temporal"}, + }, + latest_artifact=artifact, + requested_end="2024-01-01T06", + ) + + assert result.current_end == "2024-01-01T03" + assert result.delta_start == "2024-01-01T04" + + +def test_supports_append_returns_true_for_icechunk_format_without_yaml_execution_flag() -> None: + """ICECHUNK format always supports append — no sync.execution: append needed in YAML.""" + artifact = _icechunk_artifact(artifact_id="a1") + + result = sync_engine._supports_append( + source_dataset={"id": "era5land_temperature_hourly", "period_type": "hourly", "sync": {"kind": "temporal"}}, + latest_artifact=artifact, + ) + + assert result is True + + +def test_supports_append_requires_yaml_execution_flag_for_zarr_format() -> None: + zarr_artifact = _artifact(artifact_id="a1", end="2026-01-10") + + without_flag = sync_engine._supports_append( + source_dataset={"id": "chirps3_precipitation_daily", "period_type": "daily", "sync": {"kind": "temporal"}}, + latest_artifact=zarr_artifact, + ) + with_flag = sync_engine._supports_append( + source_dataset={ + "id": "chirps3_precipitation_daily", + "period_type": "daily", + "sync": {"kind": "temporal", "execution": "append"}, + }, + latest_artifact=zarr_artifact, + ) + + assert without_flag is False + assert with_flag is True + + +def test_sync_dataset_reads_committed_end_from_icechunk_store(monkeypatch: pytest.MonkeyPatch) -> None: + """sync_dataset passes the store-authoritative current_end to run_sync for ICECHUNK artifacts.""" + dataset_id = "era5land_temperature_hourly_nor" + latest = _icechunk_artifact(artifact_id="a1", 
end="2024-01-01T03") monkeypatch.setattr(services, "get_latest_artifact_for_dataset_or_404", lambda _: latest) monkeypatch.setattr( services.registry_datasets, "get_dataset", - lambda _: {"id": "worldpop_population_yearly", "period_type": "yearly", "sync": {"kind": "release"}}, + lambda _: { + "id": "era5land_temperature_hourly", + "period_type": "hourly", + "sync": {"kind": "temporal"}, + }, ) + + # Store has T00-T05; artifact record only knows about T03. + import climate_api.ingest.store as ingest_store + monkeypatch.setattr( - services, - "get_extent", - lambda: {"id": "sle", "bbox": [-13.5, 6.9, -10.1, 10.0], "country_code": "SLE"}, + ingest_store, + "read_committed_period_ids", + lambda path, period_type: { + "2024-01-01T00", + "2024-01-01T01", + "2024-01-01T02", + "2024-01-01T03", + "2024-01-01T04", + "2024-01-01T05", + }, + ) + + captured: dict[str, object] = {} + + def fake_run_sync(**kwargs: object) -> SyncResponse: + captured.update(kwargs) + return SyncResponse( + sync_id=None, + status="up_to_date", + message="ok", + dataset=_dataset_detail(dataset_id), + sync_detail=SyncDetail( + source_dataset_id="era5land_temperature_hourly", + sync_kind=SyncKind.TEMPORAL, + action=SyncAction.NO_OP, + reason="no_new_period", + message="ok", + current_start="2024-01-01T00", + current_end="2024-01-01T05", + target_end="2024-01-01T05", + target_end_source="request", + ), + ) + + monkeypatch.setattr(services, "run_sync", fake_run_sync) + + services.sync_dataset(dataset_id=dataset_id, end="2024-01-01T05", publish=False) + + assert captured["current_end"] == "2024-01-01T05" + + +def test_sync_dataset_icechunk_store_empty_uses_none_current_end(monkeypatch: pytest.MonkeyPatch) -> None: + """When the store has no committed periods yet, current_end is None (full ingest).""" + dataset_id = "era5land_temperature_hourly_nor" + latest = _icechunk_artifact(artifact_id="a1", end="2024-01-01T03") + monkeypatch.setattr(services, "get_latest_artifact_for_dataset_or_404", lambda _: 
latest) + monkeypatch.setattr( + services.registry_datasets, + "get_dataset", + lambda _: {"id": "era5land_temperature_hourly", "period_type": "hourly", "sync": {"kind": "temporal"}}, ) + import climate_api.ingest.store as ingest_store + + monkeypatch.setattr(ingest_store, "read_committed_period_ids", lambda *_: set()) captured: dict[str, object] = {} @@ -843,20 +964,130 @@ def fake_run_sync(**kwargs: object) -> SyncResponse: message="ok", dataset=_dataset_detail(dataset_id), sync_detail=SyncDetail( - source_dataset_id="worldpop_population_yearly", - sync_kind=SyncKind.RELEASE, + source_dataset_id="era5land_temperature_hourly", + sync_kind=SyncKind.TEMPORAL, action=SyncAction.REMATERIALIZE, - reason="new_release_available", + reason="new_periods_available", message="ok", - current_start="2020", - current_end="2020", - target_end="2021", + current_start="2024-01-01T00", + current_end="2024-01-01T03", + target_end="2024-01-01T06", target_end_source="request", ), ) monkeypatch.setattr(services, "run_sync", fake_run_sync) - services.sync_dataset(dataset_id=dataset_id, end="2021", prefer_zarr=True, publish=True) + services.sync_dataset(dataset_id=dataset_id, end="2024-01-01T06", publish=False) + + assert captured["current_end"] is None + + +def _patch_icechunk_artifact_dependencies( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + captured: dict[str, object], +) -> None: + """Patch all inline imports used by _create_icechunk_artifact.""" + import numpy as np + import xarray as xr + + import climate_api.ingest.orchestrator as orchestrator_mod + import climate_api.ingest.store as store_mod + from climate_api.ingestions import services as svc + + def fake_run_ingest_sync(**kwargs: object) -> None: + captured.update(kwargs) + Path(str(kwargs["store_path"])).mkdir(exist_ok=True) + + monkeypatch.setattr(orchestrator_mod, "run_ingest_sync", fake_run_ingest_sync) + monkeypatch.setattr(orchestrator_mod, "load_plugin", lambda path, params, extra_params=None: object()) + 
monkeypatch.setattr(store_mod, "open_or_create_repo", lambda _: _FakeRepo()) + monkeypatch.setattr( + svc, + "coverage_from_open_dataset", + lambda ds, **_: { + "has_data": True, + "coverage": { + "temporal": {"start": "2024-01-01T00", "end": "2024-01-01T06"}, + "spatial": {"xmin": 4.0, "ymin": 57.5, "xmax": 31.5, "ymax": 71.5}, + }, + }, + ) + monkeypatch.setattr( + xr, + "open_zarr", + lambda *_a, **_k: xr.Dataset({"t2m": xr.DataArray(np.zeros((1,)), dims=["time"])}), + ) + monkeypatch.setattr(svc, "get_extent", lambda: None) + monkeypatch.setattr(svc.downloader, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(svc, "_store_artifact_record", lambda record, **_: record) + monkeypatch.setattr(svc, "_upsert_icechunk_artifact_record", lambda record: record) + + +class _FakeRepo: + def readonly_session(self, _: str) -> "_FakeSession": + return _FakeSession() + + +class _FakeSession: + store = None + + +def test_create_icechunk_artifact_uses_ingest_start_for_delta_efficiency( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + """_create_icechunk_artifact passes ingest_start to run_ingest_sync to skip prior periods.""" + from climate_api.ingestions import services as svc + + captured: dict[str, object] = {} + _patch_icechunk_artifact_dependencies(monkeypatch, tmp_path, captured) + + dataset = { + "id": "era5land_temperature_hourly", + "name": "2m temperature (ERA5-Land)", + "variable": "t2m", + "period_type": "hourly", + "ingestion": {"plugin": "climate_api.ingest.plugins.era5_land.Era5LandPlugin", "params": {"variable": "t2m"}}, + } + + svc._create_icechunk_artifact( + dataset=dataset, # type: ignore[arg-type] + start="2024-01-01T00", + end="2024-01-01T06", + bbox=None, + request_scope=ArtifactRequestScope(start="2024-01-01T00", end="2024-01-01T06", bbox=None), + publish=False, + ingest_start="2024-01-01T04", + ) + + assert captured["start"] == "2024-01-01T04" + + +def test_create_icechunk_artifact_uses_full_start_when_no_ingest_start( + monkeypatch: 
pytest.MonkeyPatch, tmp_path: Path +) -> None: + """Without ingest_start, run_ingest_sync receives the artifact's full start.""" + from climate_api.ingestions import services as svc + + captured: dict[str, object] = {} + _patch_icechunk_artifact_dependencies(monkeypatch, tmp_path, captured) + + dataset = { + "id": "era5land_temperature_hourly", + "name": "2m temperature (ERA5-Land)", + "variable": "t2m", + "period_type": "hourly", + "ingestion": {"plugin": "climate_api.ingest.plugins.era5_land.Era5LandPlugin", "params": {"variable": "t2m"}}, + } + + svc._create_icechunk_artifact( + dataset=dataset, # type: ignore[arg-type] + start="2024-01-01T00", + end="2024-01-01T06", + bbox=None, + request_scope=ArtifactRequestScope(start="2024-01-01T00", end="2024-01-01T06", bbox=None), + publish=False, + ) - assert captured["country_code"] == "SLE" + assert captured["start"] == "2024-01-01T00" diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 398a3e40..5b2865c6 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -7,9 +7,6 @@ import pytest import xarray as xr import zarr -from fastapi import HTTPException -from topozarr.pyramid import Pyramid -from xarray import DataTree from climate_api.data_accessor.services.accessor import _coverage_from_dataset, open_zarr_dataset from climate_api.data_manager.services import downloader @@ -46,109 +43,6 @@ def test_resolve_artifacts_dir_uses_xdg_when_no_config(monkeypatch: pytest.Monke assert ingestion_services._resolve_artifacts_dir() == Path(xdg) / "climate-api" / "artifacts" -def test_download_dataset_returns_400_when_country_code_is_required(monkeypatch: pytest.MonkeyPatch) -> None: - def fake_download( - *, - start: str, - end: str, - dirname: object, - prefix: str, - overwrite: bool, - country_code: str, - ) -> None: - del start, end, dirname, prefix, overwrite, country_code - - dataset: dict[str, Any] = { - "id": "worldpop_population_yearly", - "ingestion": {"function": "ignored.path"}, - } - 
monkeypatch.delenv("COUNTRY_CODE", raising=False) - monkeypatch.setattr(downloader, "_get_dynamic_function", lambda _: fake_download) - - with pytest.raises(HTTPException) as exc_info: - downloader.download_dataset( - dataset=dataset, - start="2020-01-01", - end="2020-12-31", - bbox=None, - country_code=None, - overwrite=False, - background_tasks=None, - ) - - assert exc_info.value.status_code == 400 - assert "requires a country code" in str(exc_info.value.detail) - - -def test_download_dataset_returns_400_for_missing_bbox(monkeypatch: pytest.MonkeyPatch) -> None: - def fake_download( - *, - start: str, - end: str, - dirname: object, - prefix: str, - overwrite: bool, - bbox: list[float], - ) -> None: - del start, end, dirname, prefix, overwrite, bbox - - dataset: dict[str, Any] = { - "id": "chirps3_precipitation_daily", - "ingestion": {"function": "ignored.path"}, - } - monkeypatch.delenv("DOWNLOAD_BBOX", raising=False) - monkeypatch.delenv("DEFAULT_DOWNLOAD_BBOX", raising=False) - monkeypatch.setattr(downloader, "_get_dynamic_function", lambda _: fake_download) - - with pytest.raises(HTTPException) as exc_info: - downloader.download_dataset( - dataset=dataset, - start="2020-01-01", - end="2020-01-31", - bbox=None, - country_code=None, - overwrite=False, - background_tasks=None, - ) - - assert exc_info.value.status_code == 400 - assert "A bbox is required" in str(exc_info.value.detail) - - -def test_download_dataset_returns_502_for_upstream_provider_failure(monkeypatch: pytest.MonkeyPatch) -> None: - def fake_download( - *, - start: str, - end: str, - dirname: object, - prefix: str, - overwrite: bool, - country_code: str, - ) -> None: - del start, end, dirname, prefix, overwrite, country_code - raise RuntimeError("provider timeout") - - dataset: dict[str, Any] = { - "id": "worldpop_population_yearly", - "ingestion": {"function": "ignored.path"}, - } - monkeypatch.setattr(downloader, "_get_dynamic_function", lambda _: fake_download) - - with 
pytest.raises(HTTPException) as exc_info: - downloader.download_dataset( - dataset=dataset, - start="2020-01-01", - end="2020-12-31", - bbox=None, - country_code="SLE", - overwrite=False, - background_tasks=None, - ) - - assert exc_info.value.status_code == 502 - assert "Upstream dataset download failed: provider timeout" == str(exc_info.value.detail) - - # --------------------------------------------------------------------------- # _get_cache_prefix # --------------------------------------------------------------------------- @@ -159,113 +53,6 @@ def test_get_cache_prefix_uses_dataset_id() -> None: assert downloader._get_cache_prefix(dataset) == "chirps3_precipitation_daily" -# --------------------------------------------------------------------------- -# _validate_spatial_coverage -# --------------------------------------------------------------------------- - - -_CHIRPS3_EXTENTS: dict[str, Any] = { - "spatial": {"bbox": [-180, -50, 180, 50], "crs": "http://www.opengis.net/def/crs/OGC/1.3/CRS84"} -} -_LIMITED_LON_EXTENTS: dict[str, Any] = { - "spatial": {"bbox": [-180, -90, 60, 90], "crs": "http://www.opengis.net/def/crs/OGC/1.3/CRS84"} -} - - -def test_validate_spatial_coverage_passes_when_no_extents_declared() -> None: - dataset: dict[str, Any] = {"id": "worldpop_population_yearly", "ingestion": {}} - downloader._validate_spatial_coverage(dataset, bbox=[4.5, 57.9, 31.1, 71.2]) - - -def test_validate_spatial_coverage_passes_when_no_bbox() -> None: - dataset: dict[str, Any] = {"id": "chirps3_precipitation_daily", "ingestion": {}, "extents": _CHIRPS3_EXTENTS} - downloader._validate_spatial_coverage(dataset, bbox=None) - - -def test_validate_spatial_coverage_passes_when_template_bbox_malformed() -> None: - extents: dict[str, Any] = {"spatial": {"bbox": "not-a-list"}} - dataset: dict[str, Any] = {"id": "bad_template", "ingestion": {}, "extents": extents} - downloader._validate_spatial_coverage(dataset, bbox=[-10.0, -10.0, 10.0, 10.0]) - - -def 
test_validate_spatial_coverage_passes_when_bbox_inside_extents() -> None: - dataset: dict[str, Any] = {"id": "chirps3_precipitation_daily", "ingestion": {}, "extents": _CHIRPS3_EXTENTS} - downloader._validate_spatial_coverage(dataset, bbox=[-10.0, -10.0, 10.0, 10.0]) - - -def test_validate_spatial_coverage_raises_when_bbox_outside_lat_extents() -> None: - dataset: dict[str, Any] = { - "id": "chirps3_precipitation_daily", - "ingestion": {}, - "extents": _CHIRPS3_EXTENTS, - } - with pytest.raises(HTTPException) as exc_info: - downloader._validate_spatial_coverage(dataset, bbox=[4.5, 57.9, 31.1, 71.2]) - assert exc_info.value.status_code == 400 - assert "does not cover this extent" in str(exc_info.value.detail) - assert "Latitude" in str(exc_info.value.detail) - - -def test_validate_spatial_coverage_raises_when_bbox_outside_lon_extents() -> None: - dataset: dict[str, Any] = { - "id": "some_dataset", - "ingestion": {}, - "extents": _LIMITED_LON_EXTENTS, - } - with pytest.raises(HTTPException) as exc_info: - downloader._validate_spatial_coverage(dataset, bbox=[70.0, -10.0, 90.0, 10.0]) - assert exc_info.value.status_code == 400 - assert "Longitude" in str(exc_info.value.detail) - - -def test_download_dataset_validates_env_bbox_against_extents( - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Coverage validation uses the env fallback bbox when no bbox is passed in the request.""" - dataset: dict[str, Any] = { - "id": "chirps3_precipitation_daily", - "ingestion": {"function": "ignored.path"}, - "extents": _CHIRPS3_EXTENTS, - } - monkeypatch.setenv("DOWNLOAD_BBOX", "4.5,57.9,31.1,71.2") - - with pytest.raises(HTTPException) as exc_info: - downloader.download_dataset( - dataset=dataset, - start="2020-01-01", - end="2020-01-31", - bbox=None, - country_code=None, - overwrite=False, - background_tasks=None, - ) - assert exc_info.value.status_code == 400 - assert "does not cover this extent" in str(exc_info.value.detail) - - -def 
test_download_dataset_returns_400_when_bbox_outside_dataset_extents( - monkeypatch: pytest.MonkeyPatch, -) -> None: - dataset: dict[str, Any] = { - "id": "chirps3_precipitation_daily", - "ingestion": {"function": "ignored.path"}, - "extents": _CHIRPS3_EXTENTS, - } - - with pytest.raises(HTTPException) as exc_info: - downloader.download_dataset( - dataset=dataset, - start="2020-01-01", - end="2020-01-31", - bbox=[4.5, 57.9, 31.1, 71.2], - country_code=None, - overwrite=False, - background_tasks=None, - ) - assert exc_info.value.status_code == 400 - assert "does not cover this extent" in str(exc_info.value.detail) - - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -282,52 +69,6 @@ def _make_dataset() -> xr.Dataset: ) -def _write_nc_files(tmp_path: Path) -> list[Path]: - paths = [] - for year in (2020, 2021): - ds = xr.Dataset( - {"pop_total": (["time", "lat", "lon"], np.ones((1, 3, 3), dtype="float32"))}, - coords={ - "time": [pd.Timestamp(f"{year}-01-01")], - "lat": [10.0, 9.0, 8.0], - "lon": [30.0, 31.0, 32.0], - }, - ) - path = tmp_path / f"my_dataset_{year}.nc" - ds.to_netcdf(path) - paths.append(path) - return paths - - -def _write_daily_nc_file(tmp_path: Path) -> list[Path]: - ds = xr.Dataset( - {"precip": (["time", "lat", "lon"], np.ones((29, 3, 3), dtype="float32"))}, - coords={ - "time": pd.date_range("2024-02-01", "2024-02-29", freq="D"), - "lat": [10.0, 9.0, 8.0], - "lon": [30.0, 31.0, 32.0], - }, - ) - path = tmp_path / "chirps3_precipitation_daily_2024-02.nc" - ds.to_netcdf(path) - return [path] - - -_FLAT_DATASET: dict[str, Any] = { - "id": "my_dataset", - "variable": "pop_total", - "period_type": "yearly", - "ingestion": {}, -} - -_PYRAMID_DATASET: dict[str, Any] = { - "id": "my_dataset", - "variable": "pop_total", - "period_type": "yearly", - "ingestion": {}, -} - - # 
--------------------------------------------------------------------------- # open_zarr_dataset # --------------------------------------------------------------------------- @@ -362,18 +103,17 @@ def test_open_zarr_dataset_pyramid_falls_back_to_level_0(tmp_path: Path) -> None def test_open_zarr_dataset_pyramid_with_root_time_still_opens_level_0(tmp_path: Path) -> None: - """Root-level time coord (copied for zarr-layer) does not confuse the fallback. + """Root-level time coord does not confuse the fallback. The fallback triggers on empty data_vars, not empty dims, so a root group that only has a 'time' coordinate array still falls back to /0. """ + import shutil + ds = _make_dataset() zarr_path = tmp_path / "pyramid.zarr" zarr.open_group(str(zarr_path), mode="w", zarr_format=3) ds.to_zarr(str(zarr_path / "0"), mode="w", zarr_format=3) - # Simulate what build_dataset_zarr does: copy time to root - import shutil - shutil.copytree(str(zarr_path / "0" / "time"), str(zarr_path / "time")) result = open_zarr_dataset(str(zarr_path)) @@ -383,220 +123,6 @@ def test_open_zarr_dataset_pyramid_with_root_time_still_opens_level_0(tmp_path: result.close() -# --------------------------------------------------------------------------- -# build_dataset_zarr — flat path -# --------------------------------------------------------------------------- - - -def test_build_dataset_zarr_flat_creates_zarr(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """Flat zarr is written with the correct variable and no pyramid level dirs.""" - nc_files = _write_nc_files(tmp_path) - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: nc_files) - - downloader.build_dataset_zarr(_FLAT_DATASET) - - zarr_path = tmp_path / "my_dataset.zarr" - assert zarr_path.exists() - assert not (zarr_path / "0").exists() - - result = open_zarr_dataset(str(zarr_path)) - try: - assert "pop_total" in result.data_vars - assert result.sizes["time"] 
== 2 - finally: - result.close() - - -def test_build_dataset_zarr_normalises_coordinate_names(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """Source coordinates (lat/lon, valid_time) are renamed to x/y/time.""" - # Simulate ERA5-Land source with valid_time and lon/lat - ds_era5 = xr.Dataset( - {"t2m": (["valid_time", "lat", "lon"], np.ones((2, 3, 3), dtype="float32"))}, - coords={ - "valid_time": pd.date_range("2024-01-01", periods=2, freq="h"), - "lat": [10.0, 9.0, 8.0], - "lon": [30.0, 31.0, 32.0], - }, - ) - path = tmp_path / "era5_t2m_2024-01.nc" - ds_era5.to_netcdf(path) - - dataset: dict[str, Any] = { - "id": "era5land_temperature_hourly", - "variable": "t2m", - "period_type": "hourly", - "ingestion": {}, - } - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: [path]) - - downloader.build_dataset_zarr(dataset) - - result = open_zarr_dataset(str(tmp_path / "era5land_temperature_hourly.zarr")) - try: - assert "time" in result.coords - assert "x" in result.coords - assert "y" in result.coords - assert "valid_time" not in result.coords - assert "lat" not in result.coords - assert "lon" not in result.coords - finally: - result.close() - - -def test_build_dataset_zarr_normalises_xy_coordinate_names(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """Source coordinates already named x/y are preserved as x/y.""" - ds_xy = xr.Dataset( - {"precip": (["time", "y", "x"], np.ones((2, 3, 3), dtype="float32"))}, - coords={ - "time": pd.date_range("2024-01-01", periods=2, freq="D"), - "y": [10.0, 9.0, 8.0], - "x": [30.0, 31.0, 32.0], - }, - ) - path = tmp_path / "chirps_xy_2024-01.nc" - ds_xy.to_netcdf(path) - - dataset: dict[str, Any] = { - "id": "chirps3_precipitation_daily", - "variable": "precip", - "period_type": "daily", - "ingestion": {}, - } - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: [path]) - 
- downloader.build_dataset_zarr(dataset) - - result = open_zarr_dataset(str(tmp_path / "chirps3_precipitation_daily.zarr")) - try: - assert "time" in result.coords - assert "x" in result.coords - assert "y" in result.coords - finally: - result.close() - - -def test_build_dataset_zarr_clips_to_requested_daily_range( - tmp_path: Path, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Provider cache files may contain full months; canonical Zarr honors request scope.""" - nc_files = _write_daily_nc_file(tmp_path) - dataset: dict[str, Any] = { - "id": "chirps3_precipitation_daily", - "variable": "precip", - "period_type": "daily", - "ingestion": {}, - } - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: nc_files) - - downloader.build_dataset_zarr(dataset, start="2024-02-01", end="2024-02-10") - - result = open_zarr_dataset(str(tmp_path / "chirps3_precipitation_daily.zarr")) - try: - assert result.sizes["time"] == 10 - assert pd.Timestamp(result.time.min().item()).strftime("%Y-%m-%d") == "2024-02-01" - assert pd.Timestamp(result.time.max().item()).strftime("%Y-%m-%d") == "2024-02-10" - finally: - result.close() - - -# --------------------------------------------------------------------------- -# build_dataset_zarr — pyramid path -# --------------------------------------------------------------------------- - - -def _make_fake_pyramid(ds: xr.Dataset, zarr_path: Path) -> Pyramid: - """Return a Pyramid whose .dt.to_zarr writes a minimal two-level DataTree store.""" - level0 = ds - level1 = ds.coarsen(y=2, x=2, boundary="trim").mean() # pyright: ignore[reportAttributeAccessIssue] - dt = DataTree.from_dict({"0": level0, "1": level1}) - return Pyramid(datatree=dt, encoding={}) - - -def test_build_dataset_zarr_pyramid_copies_time_to_root(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """Pyramid zarr build copies the time coordinate to the store root for zarr-layer.""" - nc_files = 
_write_nc_files(tmp_path) - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: nc_files) - monkeypatch.setattr(downloader, "_needs_pyramid", lambda *_: True) - - def fake_create_pyramid(ds: xr.Dataset, levels: int, x_dim: str, y_dim: str, method: str) -> Pyramid: - return _make_fake_pyramid(ds, tmp_path / "my_dataset.zarr") - - monkeypatch.setattr(downloader, "create_pyramid", fake_create_pyramid) - - downloader.build_dataset_zarr(_PYRAMID_DATASET) - - zarr_path = tmp_path / "my_dataset.zarr" - assert (zarr_path / "0").exists(), "pyramid level 0 should exist" - assert (zarr_path / "time").exists(), "time coordinate must be copied to zarr root" - - -def test_build_dataset_zarr_pyramid_is_openable_via_level_0(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """open_zarr_dataset returns the dataset from level 0 of the pyramid store.""" - nc_files = _write_nc_files(tmp_path) - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: nc_files) - monkeypatch.setattr(downloader, "_needs_pyramid", lambda *_: True) - - def fake_create_pyramid(ds: xr.Dataset, levels: int, x_dim: str, y_dim: str, method: str) -> Pyramid: - return _make_fake_pyramid(ds, tmp_path / "my_dataset.zarr") - - monkeypatch.setattr(downloader, "create_pyramid", fake_create_pyramid) - - downloader.build_dataset_zarr(_PYRAMID_DATASET) - - result = open_zarr_dataset(str(tmp_path / "my_dataset.zarr")) - try: - assert "pop_total" in result.data_vars - assert result.sizes["time"] == 2 - finally: - result.close() - - -def test_build_dataset_zarr_pyramid_normalises_coordinate_names( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: - """Pyramid zarr store uses canonical x/y/time coordinate names.""" - # Source files use lat/lon (WorldPop-style); canonical names must appear in the written store. 
- nc_files = _write_nc_files(tmp_path) - monkeypatch.setattr(downloader, "DOWNLOAD_DIR", tmp_path) - monkeypatch.setattr(downloader, "get_cache_files", lambda _: nc_files) - monkeypatch.setattr(downloader, "_needs_pyramid", lambda *_: True) - - received: list[xr.Dataset] = [] - - def fake_create_pyramid(ds: xr.Dataset, levels: int, x_dim: str, y_dim: str, method: str) -> Pyramid: - received.append(ds) - return _make_fake_pyramid(ds, tmp_path / "my_dataset.zarr") - - monkeypatch.setattr(downloader, "create_pyramid", fake_create_pyramid) - - downloader.build_dataset_zarr(_PYRAMID_DATASET) - - # The dataset handed to create_pyramid must already carry canonical names. - assert len(received) == 1 - ds_in = received[0] - assert "x" in ds_in.coords - assert "y" in ds_in.coords - assert "time" in ds_in.coords - assert "lon" not in ds_in.coords - assert "lat" not in ds_in.coords - - # The written store must also expose canonical names when opened. - result = open_zarr_dataset(str(tmp_path / "my_dataset.zarr")) - try: - assert "x" in result.coords - assert "y" in result.coords - assert "time" in result.coords - finally: - result.close() - - # --------------------------------------------------------------------------- # _coverage_from_dataset — WGS84 reprojection # --------------------------------------------------------------------------- diff --git a/tests/test_ingest_orchestrator.py b/tests/test_ingest_orchestrator.py new file mode 100644 index 00000000..2baeede4 --- /dev/null +++ b/tests/test_ingest_orchestrator.py @@ -0,0 +1,631 @@ +"""Tests for the per-period Icechunk ingest orchestrator. + +All tests use FakePlugin — no network access required. +The Icechunk store is written to a pytest tmp_path directory. 
+""" + +from __future__ import annotations + +import asyncio +from pathlib import Path +from typing import Any + +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from climate_api.ingest.orchestrator import load_plugin, run_ingest, run_ingest_sync +from climate_api.ingest.protocol import GridSpec, IngestionPlugin +from climate_api.ingest.store import read_committed_period_ids + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_monthly_dataset(period_id: str, ny: int = 4, nx: int = 4) -> xr.Dataset: + """Return a tiny single-period dataset matching FakePlugin's grid.""" + t = pd.Timestamp(f"{period_id}-01") + data = np.random.default_rng(42).random((1, ny, nx)).astype("float32") + return xr.Dataset( + {"temperature": xr.DataArray(data, dims=["time", "y", "x"])}, + coords={"time": [t]}, + ) + + +# --------------------------------------------------------------------------- +# Fake plugin +# --------------------------------------------------------------------------- + + +class FakePlugin: + """In-memory IngestionPlugin that generates tiny xarray Datasets.""" + + max_concurrency = 2 + commit_batch_size = 2 + rechunk_time: int | None = None + + def __init__(self, periods: list[str]) -> None: + self._periods = periods + self.fetched: list[str] = [] + + def probe(self, bbox: list[float], **params: Any) -> GridSpec: + return GridSpec(shape=(4, 4), crs=4326, dtype=np.dtype("float32"), nodata=None) + + def periods(self, start: str, end: str) -> list[str]: + return [p for p in self._periods if start <= p <= end] + + def fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> xr.Dataset: + self.fetched.append(period_id) + return _make_monthly_dataset(period_id) + + +# --------------------------------------------------------------------------- +# Protocol conformance +# 
--------------------------------------------------------------------------- + + +def test_fake_plugin_satisfies_protocol() -> None: + plugin = FakePlugin(["2024-01", "2024-02"]) + assert isinstance(plugin, IngestionPlugin) + + +# --------------------------------------------------------------------------- +# Core orchestrator tests +# --------------------------------------------------------------------------- + + +def test_run_ingest_writes_all_periods(tmp_path: Path) -> None: + plugin = FakePlugin(["2024-01", "2024-02", "2024-03"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + ) + ) + + assert store_path.exists() + committed = read_committed_period_ids(store_path, "monthly") + assert committed == {"2024-01", "2024-02", "2024-03"} + + +def test_run_ingest_fetches_every_period_exactly_once(tmp_path: Path) -> None: + plugin = FakePlugin(["2024-01", "2024-02", "2024-03"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + ) + ) + + assert sorted(plugin.fetched) == ["2024-01", "2024-02", "2024-03"] + + +def test_run_ingest_is_idempotent(tmp_path: Path) -> None: + plugin = FakePlugin(["2024-01", "2024-02"]) + store_path = tmp_path / "test.icechunk" + + for _ in range(2): + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-02", + store_path=store_path, + period_type="monthly", + ) + ) + + # Second run fetched nothing new. 
+ assert sorted(plugin.fetched) == ["2024-01", "2024-02"] + committed = read_committed_period_ids(store_path, "monthly") + assert committed == {"2024-01", "2024-02"} + + +def test_run_ingest_resumes_from_store(tmp_path: Path) -> None: + """A second run reads committed period IDs from the store and only fetches missing ones.""" + plugin = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-02", + store_path=store_path, + period_type="monthly", + ) + ) + assert read_committed_period_ids(store_path, "monthly") == {"2024-01", "2024-02"} + + plugin2 = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04"]) + asyncio.run( + run_ingest( + plugin=plugin2, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-04", + store_path=store_path, + period_type="monthly", + ) + ) + + # Only the two new periods were fetched; pre-existing ones were skipped. 
+ assert sorted(plugin2.fetched) == ["2024-03", "2024-04"] + committed = read_committed_period_ids(store_path, "monthly") + assert committed == {"2024-01", "2024-02", "2024-03", "2024-04"} + + +def test_run_ingest_progress_reporting(tmp_path: Path) -> None: + plugin = FakePlugin(["2024-01", "2024-02", "2024-03"]) + store_path = tmp_path / "test.icechunk" + reports: list[dict[str, Any]] = [] + + def on_progress(done: int, total: int, message: str) -> None: + reports.append({"done": done, "total": total}) + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + on_progress=on_progress, + ) + ) + + totals = {r["total"] for r in reports} + assert totals == {3} + final = max(r["done"] for r in reports) + assert final == 3 + + +def test_run_ingest_cursor_saved_after_each_batch(tmp_path: Path) -> None: + """commit_batch_size=2 → cursor is saved after periods 2 and 4.""" + plugin = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04"]) + store_path = tmp_path / "test.icechunk" + saved_cursors: list[dict[str, Any]] = [] + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-04", + store_path=store_path, + period_type="monthly", + save_cursor=saved_cursors.append, + ) + ) + + assert len(saved_cursors) == 2 + assert saved_cursors[0]["last_committed"] == "2024-02" + assert saved_cursors[1]["last_committed"] == "2024-04" + + +def test_run_ingest_noop_when_no_periods(tmp_path: Path) -> None: + plugin = FakePlugin([]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + ) + ) + + assert not store_path.exists() + + +def test_run_ingest_cancels_on_request(tmp_path: Path) -> None: + from climate_api.jobs.models import 
JobCancelledError + + # Plugin with more periods than the batch — cancellation hits mid-run. + plugin = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04", "2024-05", "2024-06"]) + store_path = tmp_path / "test.icechunk" + + call_count = 0 + + def cancel_after_two() -> bool: + nonlocal call_count + call_count += 1 + # First check (before period 0) → False; subsequent checks → True after first batch. + return call_count > 1 + + with pytest.raises(JobCancelledError): + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-06", + store_path=store_path, + period_type="monthly", + is_cancel_requested=cancel_after_two, + ) + ) + + +def test_run_ingest_preserves_committed_periods_on_fetch_error(tmp_path: Path) -> None: + """Periods committed before a fetch_period exception are retained in the store.""" + store_path = tmp_path / "test.icechunk" + + # Ingest 2024-01 and 2024-02 successfully. + seed_plugin = FakePlugin(["2024-01", "2024-02"]) + asyncio.run( + run_ingest( + plugin=seed_plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-02", + store_path=store_path, + period_type="monthly", + ) + ) + + # Now try to extend with 2024-03 through 2024-05; 2024-04 will raise. + class FailOnPeriod(FakePlugin): + def fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> xr.Dataset: + if period_id == "2024-04": + raise RuntimeError("simulated fetch failure") + return super().fetch_period(period_id, bbox, **params) + + failing_plugin = FailOnPeriod(["2024-01", "2024-02", "2024-03", "2024-04", "2024-05"]) + with pytest.raises(RuntimeError, match="simulated fetch failure"): + asyncio.run( + run_ingest( + plugin=failing_plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-05", + store_path=store_path, + period_type="monthly", + ) + ) + + # The store must be valid: pre-existing + 2024-03 committed; 2024-04 and 2024-05 not. 
+ committed = read_committed_period_ids(store_path, "monthly") + assert "2024-01" in committed + assert "2024-02" in committed + assert "2024-03" in committed + assert "2024-04" not in committed + assert "2024-05" not in committed + + +# --------------------------------------------------------------------------- +# Sync wrapper +# --------------------------------------------------------------------------- + + +def test_run_ingest_sync_wrapper(tmp_path: Path) -> None: + plugin = FakePlugin(["2024-01", "2024-02"]) + store_path = tmp_path / "test.icechunk" + + run_ingest_sync( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-02", + store_path=store_path, + period_type="monthly", + ) + + assert read_committed_period_ids(store_path, "monthly") == {"2024-01", "2024-02"} + + +# --------------------------------------------------------------------------- +# load_plugin +# --------------------------------------------------------------------------- + + +def test_load_plugin_imports_and_instantiates(tmp_path: Path) -> None: + """load_plugin can resolve built-in plugins by dotted path.""" + plugin = load_plugin("climate_api.ingest.plugins.era5_land.Era5LandPlugin", {"variable": "t2m"}) + assert isinstance(plugin, IngestionPlugin) + assert plugin.max_concurrency >= 1 # type: ignore[attr-defined] + + +def test_load_plugin_raises_for_invalid_path() -> None: + with pytest.raises(ValueError, match="Invalid plugin path"): + load_plugin("NotADottedPath", {}) + + +def test_load_plugin_raises_for_non_protocol() -> None: + with pytest.raises(TypeError, match="does not implement IngestionPlugin"): + load_plugin("builtins.dict", {}) + + +# --------------------------------------------------------------------------- +# read_committed_period_ids +# --------------------------------------------------------------------------- + + +def test_read_committed_period_ids_empty_when_no_store(tmp_path: Path) -> None: + assert read_committed_period_ids(tmp_path / 
"nostore.icechunk", "monthly") == set() + + +# --------------------------------------------------------------------------- +# Era5LandPlugin._build_periods (unit tests, no network) +# --------------------------------------------------------------------------- + + +def test_era5land_periods_respects_hour_component() -> None: + from climate_api.ingest.plugins.era5_land import Era5LandPlugin + + plugin = Era5LandPlugin(variable="t2m") + periods = plugin.periods("2024-01-01T06", "2024-01-01T08") + assert periods == ["2024-01-01T06", "2024-01-01T07", "2024-01-01T08"] + + +def test_era5land_periods_single_hour() -> None: + from climate_api.ingest.plugins.era5_land import Era5LandPlugin + + plugin = Era5LandPlugin(variable="t2m") + periods = plugin.periods("2024-01-01T00", "2024-01-01T00") + assert periods == ["2024-01-01T00"] + + +def test_era5land_periods_spans_months() -> None: + from climate_api.ingest.plugins.era5_land import Era5LandPlugin + + plugin = Era5LandPlugin(variable="t2m") + periods = plugin.periods("2024-01-31T23", "2024-02-01T01") + assert periods == ["2024-01-31T23", "2024-02-01T00", "2024-02-01T01"] + + +# --------------------------------------------------------------------------- +# Rechunking +# --------------------------------------------------------------------------- + + +def _time_chunk_size(store_path: Path) -> int: + """Read the time chunk size of the first data variable from the committed store.""" + import icechunk + import zarr + + repo = icechunk.Repository.open(icechunk.local_filesystem_storage(str(store_path))) + session = repo.readonly_session("main") + g = zarr.open_group(session.store, mode="r") + for name in g.array_keys(): + arr = g[name] + dims = list(arr.metadata.dimension_names or []) # type: ignore[union-attr] + if "time" in dims: + return arr.chunks[dims.index("time")] # type: ignore[union-attr] + raise AssertionError("No array with a time dimension found") + + +def test_run_ingest_rechunks_store_after_all_periods(tmp_path: 
Path) -> None: + """rechunk_time=N rewrites the store so the time chunk size is N after ingest.""" + plugin = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-04", + store_path=store_path, + period_type="monthly", + rechunk_time=2, + ) + ) + + assert read_committed_period_ids(store_path, "monthly") == {"2024-01", "2024-02", "2024-03", "2024-04"} + assert _time_chunk_size(store_path) == 2 + + +def test_run_ingest_rechunk_preserves_all_periods_in_store(tmp_path: Path) -> None: + """After rechunking, read_committed_period_ids returns the same set as before.""" + plugin = FakePlugin(["2024-01", "2024-02", "2024-03"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + rechunk_time=3, + ) + ) + + assert read_committed_period_ids(store_path, "monthly") == {"2024-01", "2024-02", "2024-03"} + assert _time_chunk_size(store_path) == 3 + + +def test_run_ingest_no_rechunk_when_rechunk_time_is_none(tmp_path: Path) -> None: + """Without rechunk_time the time chunk stays at 1 (one period per commit).""" + plugin = FakePlugin(["2024-01", "2024-02", "2024-03"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + ) + ) + + assert _time_chunk_size(store_path) == 1 + + +def test_rechunk_store_noop_on_nonexistent_store(tmp_path: Path) -> None: + from climate_api.ingest.store import rechunk_store + + rechunk_store(tmp_path / "nostore.icechunk", time_chunk=12) + + +def test_rechunk_store_changes_chunk_size(tmp_path: Path) -> None: + """rechunk_store can be called directly to rechunk an 
existing store.""" + from climate_api.ingest.store import rechunk_store + + plugin = FakePlugin(["2024-01", "2024-02", "2024-03", "2024-04"]) + store_path = tmp_path / "test.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-04", + store_path=store_path, + period_type="monthly", + ) + ) + + assert _time_chunk_size(store_path) == 1 + + rechunk_store(store_path, time_chunk=4) + + assert _time_chunk_size(store_path) == 4 + assert read_committed_period_ids(store_path, "monthly") == {"2024-01", "2024-02", "2024-03", "2024-04"} + + +def test_rechunk_store_skips_when_no_time_dimension(tmp_path: Path) -> None: + """rechunk_store is a no-op when the store has no time dimension.""" + import icechunk + import numpy as np + import xarray as xr + + from climate_api.ingest.store import rechunk_store + + store_path = tmp_path / "static.icechunk" + storage = icechunk.local_filesystem_storage(str(store_path)) + repo = icechunk.Repository.create(storage) + ds = xr.Dataset({"elevation": xr.DataArray(np.zeros((4, 4), dtype="float32"), dims=["y", "x"])}) + session = repo.writable_session("main") + ds.to_zarr(session.store, mode="w") + session.commit("static write") + + rechunk_store(store_path, time_chunk=12) + + +def test_era5land_plugin_declares_rechunk_time() -> None: + from climate_api.ingest.plugins.era5_land import Era5LandPlugin + + plugin = Era5LandPlugin(variable="t2m") + assert plugin.rechunk_time == 12 + + +# --------------------------------------------------------------------------- +# time_dim=False (static datasets) +# --------------------------------------------------------------------------- + + +class FakeStaticPlugin(FakePlugin): + """FakePlugin variant whose probe returns time_dim=False (static dataset).""" + + def probe(self, bbox: list[float], **params: Any) -> GridSpec: + return GridSpec(shape=(4, 4), crs=4326, dtype=np.dtype("float32"), nodata=None, time_dim=False) + + def 
fetch_period(self, period_id: str, bbox: list[float], **params: Any) -> xr.Dataset: + self.fetched.append(period_id) + return xr.Dataset( + {"elevation": xr.DataArray(np.zeros((4, 4), dtype="float32"), dims=["y", "x"])}, + ) + + +def test_run_ingest_static_dataset_writes_once(tmp_path: Path) -> None: + """time_dim=False: the orchestrator commits only one write (no append) and the + store has no time dimension.""" + import icechunk + import zarr + + plugin = FakeStaticPlugin(["2024-01", "2024-02", "2024-03"]) + store_path = tmp_path / "static.icechunk" + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-03", + store_path=store_path, + period_type="monthly", + ) + ) + + assert store_path.exists() + # The store must exist and contain the static variable without a time axis. + repo = icechunk.Repository.open(icechunk.local_filesystem_storage(str(store_path))) + session = repo.readonly_session("main") + g = zarr.open_group(session.store, mode="r") + assert "elevation" in g + assert "time" not in g + + +# --------------------------------------------------------------------------- +# apply_transforms +# --------------------------------------------------------------------------- + + +def test_run_ingest_apply_transforms_called_per_period(tmp_path: Path) -> None: + """apply_transforms is invoked for every fetched period before writing.""" + plugin = FakePlugin(["2024-01", "2024-02"]) + store_path = tmp_path / "test.icechunk" + transform_calls: list[str] = [] + + def record_transform(ds: xr.Dataset) -> xr.Dataset: + transform_calls.append(str(ds.time.values[0])) + return ds + + asyncio.run( + run_ingest( + plugin=plugin, + params={}, + bbox=[-180, -90, 180, 90], + start="2024-01", + end="2024-02", + store_path=store_path, + period_type="monthly", + apply_transforms=record_transform, + ) + ) + + assert len(transform_calls) == 2 + committed = read_committed_period_ids(store_path, "monthly") + assert committed 
== {"2024-01", "2024-02"} diff --git a/tests/test_ingest_plugins.py b/tests/test_ingest_plugins.py new file mode 100644 index 00000000..ed56f849 --- /dev/null +++ b/tests/test_ingest_plugins.py @@ -0,0 +1,397 @@ +"""Unit tests for WorldPop and CHIRPS3 IngestionPlugins. + +All tests exercise the pure-Python logic (period generation, URL construction, +probe estimation) without making network calls. fetch_period tests use +monkeypatching to replace the network/rioxarray layer with a minimal stub. +""" + +from __future__ import annotations + +import math +from datetime import date +from typing import Any +from unittest.mock import MagicMock, patch + +import numpy as np +import pandas as pd +import pytest +import rioxarray # noqa: F401 # pyright: ignore[reportUnusedImport] +import xarray as xr + +from climate_api.ingest.protocol import GridSpec, IngestionPlugin + +# --------------------------------------------------------------------------- +# WorldPopPlugin +# --------------------------------------------------------------------------- + + +class TestWorldPopPlugin: + def _make_plugin(self, country_code: str = "NOR", version: str = "global2") -> Any: + from climate_api.ingest.plugins.worldpop import WorldPopPlugin + + return WorldPopPlugin(country_code=country_code, version=version) + + # Construction + + def test_country_code_uppercased(self) -> None: + plugin = self._make_plugin(country_code="nor") + assert plugin.country_code == "NOR" + + def test_satisfies_protocol(self) -> None: + plugin = self._make_plugin() + assert isinstance(plugin, IngestionPlugin) + + def test_max_concurrency_is_conservative(self) -> None: + plugin = self._make_plugin() + assert plugin.max_concurrency == 1 + + def test_commit_batch_size_is_one(self) -> None: + plugin = self._make_plugin() + assert plugin.commit_batch_size == 1 + + # URL construction + + def test_url_global2_structure(self) -> None: + from climate_api.ingest.plugins.worldpop import WorldPopPlugin + + plugin = 
WorldPopPlugin(country_code="NOR", version="global2") + url = plugin._url_for_year(2024) + assert "Global_2015_2030" in url + assert "/NOR/" in url + assert "nor_pop_2024" in url + assert url.endswith(".tif") + + def test_url_global1_structure(self) -> None: + from climate_api.ingest.plugins.worldpop import WorldPopPlugin + + plugin = WorldPopPlugin(country_code="GHA", version="global1") + url = plugin._url_for_year(2015) + assert "Global_2000_2020" in url + assert "/GHA/" in url + assert "gha_ppp_2015" in url + assert url.endswith(".tif") + + def test_url_unknown_version_raises(self) -> None: + from climate_api.ingest.plugins.worldpop import WorldPopPlugin + + plugin = WorldPopPlugin(country_code="NOR", version="badversion") + with pytest.raises(ValueError, match="Unknown WorldPop version"): + plugin._url_for_year(2020) + + # Period generation + + def test_build_periods_global2_basic(self) -> None: + plugin = self._make_plugin(version="global2") + periods = plugin.periods("2018", "2020") + assert periods == ["2018", "2019", "2020"] + + def test_build_periods_single_year(self) -> None: + plugin = self._make_plugin(version="global2") + assert plugin.periods("2023", "2023") == ["2023"] + + def test_build_periods_clamps_to_global2_range(self) -> None: + plugin = self._make_plugin(version="global2") + periods = plugin.periods("2010", "2035") + assert periods[0] == "2015" + assert periods[-1] == "2030" + + def test_build_periods_clamps_to_global1_range(self) -> None: + plugin = self._make_plugin(version="global1") + periods = plugin.periods("1995", "2025") + assert periods[0] == "2000" + assert periods[-1] == "2020" + + def test_build_periods_empty_when_out_of_range(self) -> None: + plugin = self._make_plugin(version="global2") + assert plugin.periods("2031", "2035") == [] + + def test_build_periods_uses_year_prefix_only(self) -> None: + # period strings like "2024-01-01" should be handled by stripping to year + plugin = self._make_plugin(version="global2") + periods = 
plugin.periods("2024-01-01", "2025-12-31") + assert periods == ["2024", "2025"] + + # probe / GridSpec + + def test_probe_returns_gridspec(self) -> None: + plugin = self._make_plugin() + spec = plugin.probe([4.0, 57.5, 31.5, 71.5]) + assert isinstance(spec, GridSpec) + assert spec.crs == 4326 + assert spec.time_dim is True + assert spec.dtype == np.dtype("float32") + assert spec.nodata is not None and math.isnan(spec.nodata) + assert spec.shape[0] > 0 and spec.shape[1] > 0 + + def test_probe_shape_proportional_to_bbox(self) -> None: + plugin = self._make_plugin() + small = plugin.probe([0.0, 0.0, 1.0, 1.0]) + large = plugin.probe([0.0, 0.0, 10.0, 10.0]) + # 10x wider bbox should yield ~10x more columns + assert large.shape[1] > small.shape[1] * 5 + + # fetch_period (mocked network) + + def _make_fake_da(self, ny: int = 4, nx: int = 5) -> Any: + """Build a minimal DataArray that mimics what rioxarray returns.""" + data = np.ones((1, ny, nx), dtype="float32") + y_coords = np.linspace(71.0, 57.5, ny) + x_coords = np.linspace(4.0, 31.0, nx) + da = xr.DataArray( + data, + dims=["band", "y", "x"], + coords={"band": [1], "y": y_coords, "x": x_coords}, + ) + da = da.rio.set_spatial_dims(x_dim="x", y_dim="y") + da = da.rio.write_crs("EPSG:4326") + return da + + def test_fetch_period_returns_dataset_with_time_and_pop_total(self) -> None: + from climate_api.ingest.plugins.worldpop import WorldPopPlugin + + fake_da = self._make_fake_da() + fake_resp = MagicMock() + fake_resp.raise_for_status = lambda: None + fake_resp.content = b"" + + with patch("requests.get", return_value=fake_resp), patch("rioxarray.open_rasterio", return_value=fake_da): + ds = WorldPopPlugin(country_code="NOR").fetch_period("2024", [4.0, 57.5, 31.5, 71.5]) + + assert "pop_total" in ds.data_vars + assert "time" in ds.dims + assert ds.sizes["time"] == 1 + time_val = pd.Timestamp(ds["time"].values[0]) + assert time_val.year == 2024 + + def test_fetch_period_returns_dataset_with_time_dim(self) -> None: + from 
climate_api.ingest.plugins.worldpop import WorldPopPlugin + + fake_da = self._make_fake_da() + fake_resp = MagicMock() + fake_resp.raise_for_status = lambda: None + fake_resp.content = b"" + + with patch("requests.get", return_value=fake_resp), patch("rioxarray.open_rasterio", return_value=fake_da): + ds = WorldPopPlugin(country_code="NOR").fetch_period("2024", [4.0, 57.5, 31.5, 71.5]) + + assert "time" in ds.dims + assert ds.sizes["time"] == 1 + # Encoding is intentionally left unset — the orchestrator's _strip_cf_encoding handles it. + + +# --------------------------------------------------------------------------- +# Chirps3Plugin +# --------------------------------------------------------------------------- + + +class TestChirps3Plugin: + def _make_plugin(self, stage: str = "final", flavor: str = "rnl") -> Any: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + return Chirps3Plugin(stage=stage, flavor=flavor) + + # Construction + + def test_default_stage_and_flavor(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + plugin = Chirps3Plugin() + assert plugin.stage == "final" + assert plugin.flavor == "rnl" + + def test_satisfies_protocol(self) -> None: + plugin = self._make_plugin() + assert isinstance(plugin, IngestionPlugin) + + def test_max_concurrency(self) -> None: + assert self._make_plugin().max_concurrency == 1 + + def test_commit_batch_size(self) -> None: + assert self._make_plugin().commit_batch_size == 30 + + def test_rechunk_time_declared(self) -> None: + assert self._make_plugin().rechunk_time == 30 + + def test_invalid_stage_raises(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + with pytest.raises(ValueError, match="stage"): + Chirps3Plugin(stage="bad") + + def test_invalid_flavor_for_final_raises(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + with pytest.raises(ValueError, match="flavor"): + Chirps3Plugin(stage="final", flavor="bad") + 
+ def test_invalid_flavor_for_prelim_raises(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + with pytest.raises(ValueError, match="flavor"): + Chirps3Plugin(stage="prelim", flavor="rnl") + + # URL construction + + def test_url_final_rnl_structure(self) -> None: + plugin = self._make_plugin(stage="final", flavor="rnl") + url = plugin._url_for_day(date(2024, 3, 15)) + assert "final/rnl/cogs/2024" in url + assert "chirps-v3.0.rnl.2024.03.15.cog" in url + + def test_url_final_sat_structure(self) -> None: + plugin = self._make_plugin(stage="final", flavor="sat") + url = plugin._url_for_day(date(2024, 1, 1)) + assert "final/sat/cogs/2024" in url + assert "chirps-v3.0.sat.2024.01.01.cog" in url + + def test_url_prelim_structure(self) -> None: + plugin = self._make_plugin(stage="prelim", flavor="sat") + url = plugin._url_for_day(date(2024, 11, 5)) + assert "prelim/sat/2024" in url + assert "chirps-v3.0.prelim.2024.11.05.tif" in url + + # Period generation — mock _availability_cutoff to isolate periods() logic + + def _periods_with_cutoff(self, plugin: Any, start: str, end: str, cutoff: date) -> list[str]: + with patch.object(plugin, "_availability_cutoff", return_value=cutoff): + return plugin.periods(start, end) + + def test_periods_returns_daily_dates(self) -> None: + plugin = self._make_plugin() + periods = self._periods_with_cutoff(plugin, "2024-02-01", "2024-03-31", date(2024, 2, 29)) + assert periods[0] == "2024-02-01" + assert periods[-1] == "2024-02-29" + assert len(periods) == 29 + + def test_periods_respects_cutoff(self) -> None: + plugin = self._make_plugin() + periods = self._periods_with_cutoff(plugin, "2024-01-01", "2024-03-31", date(2024, 1, 31)) + assert periods[-1] == "2024-01-31" + + def test_periods_empty_when_start_after_cutoff(self) -> None: + plugin = self._make_plugin() + periods = self._periods_with_cutoff(plugin, "2024-03-01", "2024-03-31", date(2024, 2, 29)) + assert periods == [] + + def 
test_periods_consecutive(self) -> None: + plugin = self._make_plugin() + periods = self._periods_with_cutoff(plugin, "2024-03-01", "2024-03-05", date(2024, 3, 31)) + assert periods == ["2024-03-01", "2024-03-02", "2024-03-03", "2024-03-04", "2024-03-05"] + + def test_periods_single_day(self) -> None: + plugin = self._make_plugin() + periods = self._periods_with_cutoff(plugin, "2024-03-01", "2024-03-01", date(2024, 3, 31)) + assert periods == ["2024-03-01"] + + def test_periods_spans_months(self) -> None: + plugin = self._make_plugin() + periods = self._periods_with_cutoff(plugin, "2024-03-30", "2024-04-02", date(2024, 4, 30)) + assert periods == ["2024-03-30", "2024-03-31", "2024-04-01", "2024-04-02"] + + # _availability_cutoff — mock HTTP to test CDN probing logic + + def test_availability_cutoff_returns_first_200_month(self) -> None: + plugin = self._make_plugin() + # CDN has Feb 29 but not March 31 + def fake_head(url: str, **_: Any) -> MagicMock: + resp = MagicMock() + resp.status_code = 200 if "2024.02.29" in url else 404 + return resp + with ( + patch("climate_api.ingest.plugins.chirps3.date") as mock_date, + patch("requests.head", side_effect=fake_head), + ): + mock_date.today.return_value = date(2024, 4, 15) + mock_date.fromisoformat = date.fromisoformat + mock_date.side_effect = date + cutoff = plugin._availability_cutoff() + assert cutoff == date(2024, 2, 29) + + def test_availability_cutoff_falls_back_on_all_404(self) -> None: + plugin = self._make_plugin() + def fake_head(url: str, **_: Any) -> MagicMock: + resp = MagicMock() + resp.status_code = 404 + return resp + with ( + patch("climate_api.ingest.plugins.chirps3.date") as mock_date, + patch("requests.head", side_effect=fake_head), + ): + mock_date.today.return_value = date(2024, 6, 1) + mock_date.fromisoformat = date.fromisoformat + mock_date.side_effect = date + cutoff = plugin._availability_cutoff() + # Fallback: 3 months back from June = end of March + assert cutoff == date(2024, 3, 31) + + # 
probe / GridSpec + + def test_probe_returns_gridspec(self) -> None: + plugin = self._make_plugin() + spec = plugin.probe([-180.0, -50.0, 180.0, 50.0]) + assert isinstance(spec, GridSpec) + assert spec.crs == 4326 + assert spec.time_dim is True + assert spec.dtype == np.dtype("float32") + assert spec.nodata == -9999.0 + assert spec.shape[0] > 0 and spec.shape[1] > 0 + + def test_probe_shape_matches_chirps3_global_extent(self) -> None: + plugin = self._make_plugin() + # CHIRPS3 full extent: 360° × 100° at 0.05° → 7200 × 2000 + spec = plugin.probe([-180.0, -50.0, 180.0, 50.0]) + assert spec.shape == (2000, 7200) + + # fetch_period (mocked network) + + def _make_fake_chirps_da(self, ny: int = 4, nx: int = 5) -> Any: + data = np.ones((1, ny, nx), dtype="float32") * 5.0 + y_coords = np.linspace(10.0, 5.0, ny) + x_coords = np.linspace(-5.0, 5.0, nx) + da = xr.DataArray( + data, + dims=["band", "y", "x"], + coords={"band": [1], "y": y_coords, "x": x_coords}, + ) + da = da.rio.set_spatial_dims(x_dim="x", y_dim="y") + da = da.rio.write_crs("EPSG:4326") + return da + + def test_fetch_period_returns_dataset_with_time_and_precip(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + fake_da = self._make_fake_chirps_da() + with patch("rioxarray.open_rasterio", return_value=fake_da): + ds = Chirps3Plugin().fetch_period("2024-03-15", [-5.0, 5.0, 5.0, 10.0]) + + assert "precip" in ds.data_vars + assert "time" in ds.dims + assert ds.sizes["time"] == 1 + time_val = pd.Timestamp(ds["time"].values[0]) + assert time_val == pd.Timestamp("2024-03-15") + + def test_fetch_period_masks_nodata_as_nan(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + data = np.array([[[1.0, -9999.0], [3.0, 4.0]]], dtype="float32") + da = xr.DataArray(data, dims=["band", "y", "x"], coords={"band": [1], "y": [2.0, 1.0], "x": [0.0, 1.0]}) + da = da.rio.set_spatial_dims(x_dim="x", y_dim="y") + da = da.rio.write_crs("EPSG:4326") + + with 
patch("rioxarray.open_rasterio", return_value=da): + ds = Chirps3Plugin().fetch_period("2024-01-01", [0.0, 1.0, 1.0, 2.0]) + + precip = ds["precip"].values + assert np.isnan(precip).any(), "nodata pixels should be NaN" + assert not np.isnan(precip).all(), "non-nodata pixels should be finite" + + def test_fetch_period_returns_dataset_with_time_dim(self) -> None: + from climate_api.ingest.plugins.chirps3 import Chirps3Plugin + + fake_da = self._make_fake_chirps_da() + with patch("rioxarray.open_rasterio", return_value=fake_da): + ds = Chirps3Plugin().fetch_period("2024-03-15", [-5.0, 5.0, 5.0, 10.0]) + + assert "time" in ds.dims + assert ds.sizes["time"] == 1 + # Encoding is intentionally left unset — the orchestrator's _strip_cf_encoding handles it. diff --git a/tests/test_processing_resample.py b/tests/test_processing_resample.py index 1aa1c912..fd0d33f0 100644 --- a/tests/test_processing_resample.py +++ b/tests/test_processing_resample.py @@ -42,7 +42,6 @@ def _artifact( dataset_name=dataset_id, variable="value", format=ArtifactFormat.ZARR, - path=str(path), asset_paths=[str(path)], variables=["value"], request_scope=ArtifactRequestScope( @@ -110,7 +109,7 @@ def test_materialize_resampled_artifact_builds_daily_dataset_from_hourly_source( assert artifact.dataset_id == "era5land_temperature_hourly_1d_mean" assert artifact.coverage.temporal.start == "2026-01-01" assert artifact.coverage.temporal.end == "2026-01-02" - result = xr.open_zarr(artifact.path, consolidated=True) + result = xr.open_zarr(artifact.asset_paths[0], consolidated=True) try: assert result["value"].shape == (2, 1, 1) assert result["value"].values[:, 0, 0].tolist() == [11.5, 35.5] @@ -164,7 +163,7 @@ def test_materialize_resampled_artifact_supports_custom_frequency_dekadal( assert artifact.dataset_id == "chirps3_precipitation_daily_10d_sum" assert artifact.coverage.temporal.start == "2026-01-01" - result = xr.open_zarr(artifact.path, consolidated=True) + result = xr.open_zarr(artifact.asset_paths[0], 
consolidated=True) try: assert result["value"].values[:, 0, 0].tolist() == [10.0] finally: @@ -184,6 +183,54 @@ def test_materialize_resampled_artifact_returns_404_when_source_dataset_template ) +def test_materialize_resampled_artifact_reads_icechunk_source( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + source_path = tmp_path / "source.icechunk" + source_artifact = _artifact( + artifact_id="source-icechunk", + dataset_id="era5land_temperature_hourly", + managed_dataset_id="era5land_temperature_hourly_sle", + path=source_path, + start="2026-01-01T00", + end="2026-01-02T23", + ) + source_artifact = source_artifact.model_copy(update={"format": ArtifactFormat.ICECHUNK}) + + time = np.array("2026-01-01T00", dtype="datetime64[h]") + np.arange(48) + ds = xr.Dataset( + {"value": (("time", "lat", "lon"), np.arange(48, dtype=float).reshape(48, 1, 1))}, + coords={"time": time, "lat": [2.0], "lon": [1.0]}, + ) + + monkeypatch.setattr( + resample.registry_datasets, + "get_dataset", + lambda dataset_id: {"id": dataset_id, "period_type": "hourly"} if "hourly" in dataset_id else None, + ) + monkeypatch.setattr( + resample.ingestion_services, + "get_latest_artifact_for_dataset_or_404", + lambda _: source_artifact, + ) + monkeypatch.setattr(resample, "open_icechunk_dataset", lambda _: ds) + + artifact = resample.materialize_resampled_artifact( + source_dataset_id="era5land_temperature_hourly", + frequency="1D", + method="mean", + start="2026-01-01", + end="2026-01-02", + overwrite=False, + publish=False, + ) + + assert artifact.dataset_id == "era5land_temperature_hourly_1d_mean" + assert artifact.coverage.temporal.start == "2026-01-01" + assert artifact.coverage.temporal.end == "2026-01-02" + + def test_materialize_resampled_artifact_drops_incomplete_trailing_week( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, @@ -231,7 +278,7 @@ def test_materialize_resampled_artifact_drops_incomplete_trailing_week( # W03 (Jan 12-18) is incomplete — only W02 (Jan 5-11) is 
covered fully assert artifact.coverage.temporal.start == "2026-W02" assert artifact.coverage.temporal.end == "2026-W02" - result = xr.open_zarr(artifact.path, consolidated=True) + result = xr.open_zarr(artifact.asset_paths[0], consolidated=True) try: assert result["value"].values[:, 0, 0].tolist() == [7.0] finally: @@ -285,7 +332,7 @@ def test_materialize_resampled_artifact_drops_incomplete_leading_week( # W02 (Jan 5-11) starts Wednesday Jan 7 — incomplete leading week dropped assert artifact.coverage.temporal.start == "2026-W03" assert artifact.coverage.temporal.end == "2026-W03" - result = xr.open_zarr(artifact.path, consolidated=True) + result = xr.open_zarr(artifact.asset_paths[0], consolidated=True) try: assert result["value"].values[:, 0, 0].tolist() == [7.0] finally: @@ -388,7 +435,7 @@ def test_materialize_resampled_artifact_builds_monthly_dataset_from_daily_source # Monthly resampled timestamp is the start of the month assert artifact.coverage.temporal.start == "2026-01" assert artifact.coverage.temporal.end == "2026-01" - result = xr.open_zarr(artifact.path, consolidated=True) + result = xr.open_zarr(artifact.asset_paths[0], consolidated=True) try: assert result["value"].values[:, 0, 0].tolist() == [31.0] finally: @@ -445,7 +492,7 @@ def test_materialize_resampled_artifact_keeps_complete_week_for_daily_non_midnig assert artifact.coverage.temporal.start == "2026-W02" assert artifact.coverage.temporal.end == "2026-W02" - result = xr.open_zarr(artifact.path, consolidated=True) + result = xr.open_zarr(artifact.asset_paths[0], consolidated=True) try: assert result["value"].values[:, 0, 0].tolist() == [7.0] finally: @@ -702,7 +749,7 @@ def test_materialize_resampled_artifact_rematerializes_when_overwrite_is_true( ) assert second.artifact_id == first.artifact_id - result = xr.open_zarr(second.path, consolidated=True) + result = xr.open_zarr(second.asset_paths[0], consolidated=True) try: assert result["value"].values[:, 0, 0].tolist() == [35.5] finally: diff 
--git a/tests/test_provider_availability.py b/tests/test_provider_availability.py deleted file mode 100644 index 83ff8faf..00000000 --- a/tests/test_provider_availability.py +++ /dev/null @@ -1,107 +0,0 @@ -from datetime import UTC, date, datetime - -import pytest - -from climate_api.providers import availability - - -def test_chirps3_daily_latest_available_uses_previous_complete_month_after_threshold( - monkeypatch: pytest.MonkeyPatch, -) -> None: - class FixedDate(date): - @classmethod - def today(cls) -> "FixedDate": - return cls(2026, 4, 21) - - monkeypatch.setattr(availability, "utc_today", lambda: FixedDate(2026, 4, 21)) - - result = availability.chirps3_daily_latest_available( - dataset={"sync": {"availability": {"complete_month_after_day": 20}}}, - requested_end="2026-04-21", - ) - - assert result == "2026-03-31" - - -def test_chirps3_daily_latest_available_uses_month_before_previous_on_threshold_day( - monkeypatch: pytest.MonkeyPatch, -) -> None: - class FixedDate(date): - @classmethod - def today(cls) -> "FixedDate": - return cls(2026, 4, 20) - - monkeypatch.setattr(availability, "utc_today", lambda: FixedDate(2026, 4, 20)) - - result = availability.chirps3_daily_latest_available( - dataset={"sync": {"availability": {"complete_month_after_day": 20}}}, - requested_end="2026-04-20", - ) - - assert result == "2026-02-28" - - -def test_lagged_latest_available_formats_hourly_lag(monkeypatch: pytest.MonkeyPatch) -> None: - class FixedDateTime(datetime): - @classmethod - def now(cls, tz: object = None) -> "FixedDateTime": # noqa: ANN401 - return cls(2026, 4, 21, 12, 34, tzinfo=UTC) - - monkeypatch.setattr(availability, "utc_now", lambda: FixedDateTime(2026, 4, 21, 12, 34, tzinfo=UTC)) - - result = availability.lagged_latest_available( - dataset={ - "period_type": "hourly", - "sync": {"availability": {"lag_hours": 5}}, - }, - requested_end="2026-04-21T12:00:00", - ) - - assert result == "2026-04-21T07" - - -def 
test_lagged_latest_available_formats_daily_lag(monkeypatch: pytest.MonkeyPatch) -> None: - class FixedDate(date): - @classmethod - def today(cls) -> "FixedDate": - return cls(2026, 4, 21) - - monkeypatch.setattr(availability, "utc_today", lambda: FixedDate(2026, 4, 21)) - - result = availability.lagged_latest_available( - dataset={ - "period_type": "daily", - "sync": {"availability": {"lag_days": 2}}, - }, - requested_end="2026-04-21", - ) - - assert result == "2026-04-19" - - -def test_worldpop_release_latest_available_allows_configured_future_projection() -> None: - result = availability.worldpop_release_latest_available( - dataset={"period_type": "yearly", "sync": {"availability": {"allow_future": True}}}, - requested_end="2030", - ) - - assert result == "2030" - - -def test_lagged_latest_available_formats_yearly_offset(monkeypatch: pytest.MonkeyPatch) -> None: - class FixedDate(date): - @classmethod - def today(cls) -> "FixedDate": - return cls(2026, 4, 21) - - monkeypatch.setattr(availability, "utc_today", lambda: FixedDate(2026, 4, 21)) - - result = availability.lagged_latest_available( - dataset={ - "period_type": "yearly", - "sync": {"availability": {"latest_year_offset": 1}}, - }, - requested_end="2028", - ) - - assert result == "2025" diff --git a/tests/test_publications.py b/tests/test_publications.py index 15ea3af5..a80cea1e 100644 --- a/tests/test_publications.py +++ b/tests/test_publications.py @@ -74,7 +74,6 @@ def test_build_collection_resource_keeps_singleton_time_dimension_for_zarr( dataset_name="CHIRPS monthly total precipitation", variable="precip", format=ArtifactFormat.ZARR, - path="/tmp/chirps3_precipitation_daily_ms_sum.zarr", asset_paths=["/tmp/chirps3_precipitation_daily_ms_sum.zarr"], variables=["precip"], request_scope=ArtifactRequestScope(start="2024-01-01", end="2024-01-31"), diff --git a/tests/test_stac.py b/tests/test_stac.py index 057170f3..7e532561 100644 --- a/tests/test_stac.py +++ b/tests/test_stac.py @@ -25,7 +25,7 @@ 
@pytest.fixture(autouse=True) def _clear_xstac_collection_cache() -> None: - stac_services._clear_xstac_collection_cache() + stac_services._xstac_collection_cache.clear() def _artifact( @@ -49,7 +49,6 @@ def _artifact( dataset_name=dataset_name, variable=variable, format=format, - path=path, asset_paths=[path] if asset_paths is None and path is not None else (asset_paths or []), variables=[variable], request_scope=ArtifactRequestScope( @@ -100,14 +99,13 @@ def test_catalog_self_link_reflects_request_path(client: TestClient, monkeypatch assert payload["links"][0]["href"].endswith("/stac") -def test_catalog_excludes_unpublished_and_netcdf(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: +def test_catalog_excludes_unpublished(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr( ingestion_services, "list_artifacts", lambda: SimpleNamespace( items=[ _artifact(artifact_id="a1", status=PublicationStatus.UNPUBLISHED), - _artifact(artifact_id="a2", format=ArtifactFormat.NETCDF), ] ), ) @@ -119,6 +117,64 @@ def test_catalog_excludes_unpublished_and_netcdf(client: TestClient, monkeypatch assert [link for link in payload["links"] if link["rel"] == "child"] == [] +def test_catalog_includes_published_icechunk_artifact(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + ingestion_services, + "list_artifacts", + lambda: SimpleNamespace(items=[_artifact(artifact_id="a1", format=ArtifactFormat.ICECHUNK)]), + ) + monkeypatch.setattr(stac_services.registry_datasets, "get_dataset", lambda _: {"period_type": "daily"}) + monkeypatch.setattr(stac_services, "open_icechunk_dataset", lambda _: SimpleNamespace(close=lambda: None)) + monkeypatch.setattr(stac_services, "get_x_y_dims", lambda _: ("x", "y")) + monkeypatch.setattr(stac_services, "get_time_dim", lambda _: "time") + monkeypatch.setattr(stac_services, "xarray_to_stac", lambda ds, template, **kw: template) + + response = client.get("/stac/catalog.json") + + 
assert response.status_code == 200 + child_links = [link for link in response.json()["links"] if link["rel"] == "child"] + assert len(child_links) == 1 + assert "chirps3_precipitation_daily" in child_links[0]["href"] + + +def test_collection_uses_icechunk_dataset_for_icechunk_artifact( + monkeypatch: pytest.MonkeyPatch, +) -> None: + artifact = _artifact(artifact_id="a1", format=ArtifactFormat.ICECHUNK) + + class DummyDataset: + def close(self) -> None: + pass + + opened: list[str] = [] + + def fake_open_icechunk(path: str) -> DummyDataset: + opened.append(path) + return DummyDataset() + + template = pystac.Collection( + id="chirps3_precipitation_daily", + description="template", + extent=pystac.Extent( + spatial=pystac.SpatialExtent([[1.0, 2.0, 3.0, 4.0]]), + temporal=pystac.TemporalExtent([[datetime(2026, 1, 1, tzinfo=UTC), datetime(2026, 1, 10, tzinfo=UTC)]]), + ), + title="CHIRPS3 precipitation", + license="proprietary", + ) + template.add_asset("zarr", pystac.Asset(href="http://example.test/zarr")) + monkeypatch.setattr(stac_services, "open_icechunk_dataset", fake_open_icechunk) + monkeypatch.setattr(stac_services, "get_x_y_dims", lambda _: ("x", "y")) + monkeypatch.setattr(stac_services, "get_time_dim", lambda _: "time") + monkeypatch.setattr(stac_services, "xarray_to_stac", lambda *args, **kwargs: template) + + payload = stac_services._build_collection_with_xstac(artifact=artifact, template=template) + + assert payload["type"] == "Collection" + assert len(opened) == 1 + assert opened[0] == "/tmp/chirps3_precipitation_daily.zarr" + + def test_collection_uses_xstac_and_adds_expected_fields(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr( ingestion_services, @@ -286,7 +342,7 @@ def test_collection_sets_hourly_step_to_pt1h(client: TestClient, monkeypatch: py assert payload["cube:dimensions"]["valid_time"]["step"] == "PT1H" -def test_collection_uses_level0_href_for_pyramid_zarr_store( +def 
test_collection_uses_root_href_for_pyramid_zarr_store( client: TestClient, monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" @@ -321,10 +377,10 @@ def test_collection_uses_level0_href_for_pyramid_zarr_store( assert response.status_code == 200 payload = response.json() - assert payload["assets"]["zarr"]["href"].endswith("/zarr/chirps3_precipitation_daily/0") + assert payload["assets"]["zarr"]["href"].endswith("/zarr/chirps3_precipitation_daily") -def test_collection_uses_level0_href_for_remote_pyramid_zarr_store( +def test_collection_uses_root_href_for_remote_pyramid_zarr_store( client: TestClient, monkeypatch: pytest.MonkeyPatch ) -> None: artifact = _artifact(artifact_id="a1", path="s3://example-bucket/chirps3_precipitation_daily.zarr") @@ -358,7 +414,7 @@ def test_collection_uses_level0_href_for_remote_pyramid_zarr_store( assert response.status_code == 200 payload = response.json() - assert payload["assets"]["zarr"]["href"].endswith("/zarr/chirps3_precipitation_daily/0") + assert payload["assets"]["zarr"]["href"].endswith("/zarr/chirps3_precipitation_daily") def test_collection_returns_404_for_unknown_dataset(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: diff --git a/uv.lock b/uv.lock index ca02ddd9..186eecab 100644 --- a/uv.lock +++ b/uv.lock @@ -170,40 +170,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" }, ] -[[package]] -name = "cartopy" -version = "0.25.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "matplotlib" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "pyproj" }, - { name = "pyshp" }, - { name = "shapely" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/3c/3f/ec3dee34237b696a486d566a6d3ae6550ae821836e0412bafdcbbec2cfd2/cartopy-0.25.0.tar.gz", hash = "sha256:55f1a390e5f3f075b221c7d91fb10258ad978db786c7930eba06eb45d28753fe", size = 10767728, upload-time = "2025-08-01T12:44:16.573Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8e/b9/0773ff8f1c755b8a362029e6910db87064d27ca021b060c48ce511ec98b7/cartopy-0.25.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a6fcd2df8039293096f957fc9c76e969b1a9715d12ab8cee1a6bdae0c6773b8b", size = 11007728, upload-time = "2025-08-01T12:44:06.64Z" }, - { url = "https://files.pythonhosted.org/packages/34/a6/75738630b7f64bca7afc6bc5de08ddf0c61f13563f2a1abf642373d1095e/cartopy-0.25.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e4def451617e6957169447fe6ecdad0f63ef2d2007e7d451dd7b9656ada57382", size = 10996613, upload-time = "2025-08-01T12:44:08.822Z" }, - { url = "https://files.pythonhosted.org/packages/19/0d/669d4bbeb36b87ba504409d85c68ec297e6f434ea6525424f8aa5f14abac/cartopy-0.25.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c388824cb13e4fa9c2901dc4fbb2dbe9547acd2f4a6a3440983d4e6c6973ae3", size = 11829044, upload-time = "2025-08-01T12:44:11.402Z" }, - { url = "https://files.pythonhosted.org/packages/01/ff/b46e2120abd99b2ff3d376dc91ed58ae8f0a052d57c242c9b140497573dd/cartopy-0.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:60bad14c072d16e3c96967638cd66eb5a62cf24bc70087bcbfc6b30a3872ed26", size = 10987060, upload-time = "2025-08-01T12:44:14.222Z" }, -] - -[[package]] -name = "cdsapi" -version = "0.7.7" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ecmwf-datastores-client" }, - { name = "requests" }, - { name = "tqdm" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/55/f3/6cb5b4bf077c441978c5d5be3a568d37e1f07f3e7177a17fa66aec2594b6/cdsapi-0.7.7.tar.gz", hash = "sha256:bc0cf807c1b78aceba6a11c3a5180f885f47f71a4e58205e324cfedcee16f10b", size 
= 13322, upload-time = "2025-09-30T19:11:22.404Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4f/f4/4a65460d5cb6784128019fd707a87993f378db25e796eba01400a0903f62/cdsapi-0.7.7-py2.py3-none-any.whl", hash = "sha256:384c1658572d6dc53f4111f6dd46fcdfe6fea54a688af9756d71f6fe9118b66d", size = 12293, upload-time = "2025-09-30T19:11:21.184Z" }, -] - [[package]] name = "certifi" version = "2026.1.4" @@ -402,18 +368,19 @@ name = "climate-api" version = "0.1.0a1" source = { editable = "." } dependencies = [ - { name = "dhis2eo" }, { name = "earthkit-transforms" }, { name = "fastapi" }, { name = "geojson-pydantic" }, { name = "geozarr-toolkit" }, { name = "httpx" }, + { name = "icechunk" }, { name = "jinja2" }, { name = "metpy" }, { name = "portalocker" }, { name = "pygeoapi" }, { name = "pystac" }, { name = "python-dotenv" }, + { name = "python-multipart" }, { name = "rioxarray" }, { name = "starlette" }, { name = "topozarr" }, @@ -435,18 +402,19 @@ dev = [ [package.metadata] requires-dist = [ - { name = "dhis2eo", specifier = ">=1.2.1" }, { name = "earthkit-transforms", specifier = "==0.5.*" }, { name = "fastapi", specifier = ">=0.100.0" }, { name = "geojson-pydantic", specifier = ">=2.1.0" }, { name = "geozarr-toolkit", specifier = "==0.1.*" }, { name = "httpx", specifier = ">=0.28.1" }, + { name = "icechunk", specifier = ">=2.0,<3" }, { name = "jinja2", specifier = ">=3.1" }, { name = "metpy", specifier = ">=1.7,<2" }, { name = "portalocker", specifier = ">=3.2.0" }, { name = "pygeoapi", specifier = ">=0.22.0" }, { name = "pystac", specifier = ">=1.10,<2" }, { name = "python-dotenv", specifier = ">=1.0.1" }, + { name = "python-multipart", specifier = ">=0.0.29" }, { name = "rioxarray", specifier = ">=0.17" }, { name = "starlette", specifier = ">=0.27.0" }, { name = "topozarr", specifier = "==0.0.*" }, @@ -598,27 +566,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/02/c3/253a89ee03fc9b9682f1541728eb66db7db22148cd94f89ab22528cd1e1b/deprecation-2.1.0-py2.py3-none-any.whl", hash = "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a", size = 11178, upload-time = "2020-04-20T14:23:36.581Z" }, ] -[[package]] -name = "dhis2eo" -version = "1.2.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "aiohttp" }, - { name = "earthkit-data", extra = ["cds", "geopandas", "geotiff", "projection"] }, - { name = "ecmwf-datastores-client" }, - { name = "geopandas" }, - { name = "numpy" }, - { name = "pandas" }, - { name = "requests" }, - { name = "rioxarray" }, - { name = "xarray" }, - { name = "zarr" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e5/3d/0a2945f5459bbda30e2d014a2c02417415374953e119f7a19d58c2bdf004/dhis2eo-1.2.1.tar.gz", hash = "sha256:dfc8687f033ae36758bdbc9f78ac648c50e1badf483356d35ebbee6d00f800e3", size = 22424, upload-time = "2026-05-07T11:52:09.867Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6d/f3/c07d21f796b809ae72858d537c68fe6be7f9f97d363ba0f76adb604b8795/dhis2eo-1.2.1-py3-none-any.whl", hash = "sha256:25fc25a8225ef7b9384a5eba6df491857562b317312cb931b8af924878fa0c52", size = 29273, upload-time = "2026-05-07T11:52:08.23Z" }, -] - [[package]] name = "donfig" version = "0.8.1.post1" @@ -661,22 +608,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/56/97/098f7faf1267031c54662d52791e219f95528806a744cae37014fc21be07/earthkit_data-0.16.8-py3-none-any.whl", hash = "sha256:afa6a5cc6119756be93951a75014d96ce62eb20a5acba05553a3ddca723e1a35", size = 378372, upload-time = "2026-02-18T13:34:18.988Z" }, ] -[package.optional-dependencies] -cds = [ - { name = "cdsapi" }, -] -geopandas = [ - { name = "geopandas" }, -] -geotiff = [ - { name = "pyproj" }, - { name = "rasterio" }, - { name = "rioxarray" }, -] -projection = [ - { name = "cartopy" }, -] - [[package]] name = "earthkit-meteo" version = 
"0.5.1" @@ -735,21 +666,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/a3/f58ff573ba0f678ff8116686e868afe436627b19b457a2aba62cd463c9ad/eccodes-2.47.0-py3-none-any.whl", hash = "sha256:13d0b28bd58e94e2c303f42415ca0dcc56ab3febf0f52b1fb0f1d4aa5e7db8e1", size = 91567, upload-time = "2026-04-22T11:30:06.789Z" }, ] -[[package]] -name = "ecmwf-datastores-client" -version = "0.4.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "attrs" }, - { name = "multiurl" }, - { name = "requests" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/51/60/f86eb3e57baf2b1780a7046148c234e9e57b0aeb550d30f39e50991da253/ecmwf_datastores_client-0.4.2.tar.gz", hash = "sha256:7cee1f5e5dab34edcc794cd62bee02c603fafb6f4cc2121c5f012806e0f7934d", size = 48205, upload-time = "2026-01-21T15:27:31.665Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/40/2ccf4c87a5f9c8198fe71600d5f307f5dada201c091af8774a9c1e360865/ecmwf_datastores_client-0.4.2-py3-none-any.whl", hash = "sha256:d22a675b35263286de09969502ec897da9ceb9e4c8ec4d709f7ebb3b90d3ae98", size = 29092, upload-time = "2026-01-21T15:27:30.452Z" }, -] - [[package]] name = "entrypoints" version = "0.4" @@ -1101,6 +1017,37 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[[package]] +name = "icechunk" +version = "2.0.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zarr" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/97/99/fabc9794c1f82e51b5c7c66301695b8fd920f72dee1726104dbdbd8df3e7/icechunk-2.0.5.tar.gz", hash = "sha256:50a2a44a1b561d3f2d3b5d19725c3759f300dc67225a2360fc793d894abfcab1", size = 3327412, upload-time = 
"2026-05-18T20:22:05.466Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/4e/73e0851289894ce7ba1d88e8ecc00f49dbf51220129ac3ab703a0a599eab/icechunk-2.0.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:9c7baeab6837a1e0aec8b5dd63f865b842d66d92314e02de40612190acdcaa2c", size = 16834379, upload-time = "2026-05-18T20:22:42.89Z" }, + { url = "https://files.pythonhosted.org/packages/2b/54/84e504554e9a502a4bed4d2d1e72ff0cd256e103e0c632a40e31d6c7fc9d/icechunk-2.0.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bd02bc6fb32c9c6477f45d6818e7e363d5365884d42c03d66d97801c9ed98726", size = 15538385, upload-time = "2026-05-18T20:22:33.645Z" }, + { url = "https://files.pythonhosted.org/packages/49/a9/3241119145b05beec05b45e46581faebdc02c14f5f923ebbb56ac6d0bdb0/icechunk-2.0.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5105891c297c9a8c3ff96bef78125a576224eaf3b970e6f0aa6ec4d8cab876", size = 17229820, upload-time = "2026-05-18T20:22:23.591Z" }, + { url = "https://files.pythonhosted.org/packages/4e/ac/8392e6b23841aa81324144b7893bf7685137658848a2d8cffbd4955d89b1/icechunk-2.0.5-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b73c5ce8987a6975970c050751d47b43af7ecaaf94c1dfa130446e3972b7f8bd", size = 16867941, upload-time = "2026-05-18T20:21:59.35Z" }, + { url = "https://files.pythonhosted.org/packages/ca/95/a323289e37ccdd6e4a7ddc8251681d921428c788c9b7f05e731d537015f0/icechunk-2.0.5-cp313-cp313-manylinux_2_28_armv7l.whl", hash = "sha256:53d7c7926251c8a45d1b526e1fe348619239c011920402f6466cfcfbcd96ba74", size = 16697076, upload-time = "2026-05-18T20:22:11.771Z" }, + { url = "https://files.pythonhosted.org/packages/2d/82/ef006b6433127a7b6aa9fbd9183cb2f4ce69df61096220e16d0292b563c9/icechunk-2.0.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:47883961b0b570eb206b3218ee285ad9e36dd407c5988ee39032356c73a90f40", size = 17086219, upload-time = "2026-05-18T20:22:53.068Z" }, + { url = 
"https://files.pythonhosted.org/packages/6e/42/0d1ede1a3cd383f2bf00cc8905816339885022efde31951015fb2e110885/icechunk-2.0.5-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:4779422967dcf69b2c2606211ae48b313e865a0caf3414afab790422ef1b5d7e", size = 16868204, upload-time = "2026-05-18T20:23:03.334Z" }, + { url = "https://files.pythonhosted.org/packages/ae/b3/baefe0939737277efbec5b97fe0f4286f53f81a423d7cab2e7d5128f1633/icechunk-2.0.5-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:83bf18ac31aace8325ce9cc1ee73581f007c0bf3061ad10b4b3ffff8b734977b", size = 16945422, upload-time = "2026-05-18T20:23:13.69Z" }, + { url = "https://files.pythonhosted.org/packages/be/1f/144799746b0b5269458c4a24049bae7f4d52da55735c10e173de5c76c134/icechunk-2.0.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ae7bb09eafeb714597d795e74506a9f258a798d7a26d840c37b3bfec44e988a4", size = 17637469, upload-time = "2026-05-18T20:23:23.199Z" }, + { url = "https://files.pythonhosted.org/packages/02/bc/1dec19138d4ab82175a8b2cddd24320003476c8e9e4d2cafa70b096d5b76/icechunk-2.0.5-cp313-cp313-win_amd64.whl", hash = "sha256:978ddc20fb1e6abfaacdb31810a37a83b94b488d7795e77931576f220930f72a", size = 15936134, upload-time = "2026-05-18T20:23:37.825Z" }, + { url = "https://files.pythonhosted.org/packages/e3/47/0e29dd5248dbaddef6351e9807d61c4eacae325f3b1415a2bb0ce5b2e92e/icechunk-2.0.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:2d2369db67502118150612ad481468cc0ae1333ba5cf6084179da04591e566b2", size = 16841657, upload-time = "2026-05-18T20:22:45.594Z" }, + { url = "https://files.pythonhosted.org/packages/2f/c5/9652585ee78a0f242d2c92983a550990b6a39779d6d0a7c24ab82f293d39/icechunk-2.0.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0554ef924b14f7d6369919c22f042a817f8dbc85d0b9efd793398e2d44786fa4", size = 15544742, upload-time = "2026-05-18T20:22:36.879Z" }, + { url = 
"https://files.pythonhosted.org/packages/59/2c/076e478b9b45616ef268bdb0473d6d0c8bfc4ad62224477c0c066b74ebe0/icechunk-2.0.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43289b68bc3ca93804a8ffd96e356f509c87bfd35c3d8202382dd97febd9957d", size = 17240728, upload-time = "2026-05-18T20:22:26.306Z" }, + { url = "https://files.pythonhosted.org/packages/fa/06/8c0f3fb0df245f42d05d4b423a92766a466ad9e36711972da84196263d14/icechunk-2.0.5-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:35812c0c3087688c422470610fa9f77f6edb2d5d7c3345e0610beb32c8028f96", size = 16883484, upload-time = "2026-05-18T20:22:02.374Z" }, + { url = "https://files.pythonhosted.org/packages/0f/6a/74440741ac30bb09c9d3fb74acb1332f218ae87faba24f6a81c8c575b9d2/icechunk-2.0.5-cp314-cp314-manylinux_2_28_armv7l.whl", hash = "sha256:ba30ea180b056c34fcf094b36fd7fb7cbd2521f778e5d1080ce9ab9b2f28204d", size = 16706325, upload-time = "2026-05-18T20:22:15.083Z" }, + { url = "https://files.pythonhosted.org/packages/4b/09/fe20275108f44a7ad8bbb904165feaf65f31796a8dad7baa764d86181fa9/icechunk-2.0.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:65075bce7674d2292344a661129fa987a579e5ab46c35862ef366b0c167e1066", size = 17099016, upload-time = "2026-05-18T20:22:56.368Z" }, + { url = "https://files.pythonhosted.org/packages/fb/f6/bd9b9d79b1a10a8382116ac018f3580f6e8021674fe53167829fba70bd7e/icechunk-2.0.5-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:231f8a46de0949da8ffd0e59446342a1051e374276385ca9a529ab593e73c7ce", size = 16878446, upload-time = "2026-05-18T20:23:06.761Z" }, + { url = "https://files.pythonhosted.org/packages/c6/af/025f9c303dca742c709a8c01f809479c3d22bdf630954f08eadb6eaace5b/icechunk-2.0.5-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:65e514a6394b1ed5f3726d7da1cc3cafae8e23c7a6b243dc793b1eb876078813", size = 16955472, upload-time = "2026-05-18T20:23:16.865Z" }, + { url = 
"https://files.pythonhosted.org/packages/52/b3/7429c90e9a0512f6e11443ccbc9d4ee392757b53c8604db05578457ffac9/icechunk-2.0.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e1a488e9162d64e05e995aa4015cc8118c68fc54d0ed5af4616627a66a4d1d01", size = 17646740, upload-time = "2026-05-18T20:23:30.02Z" }, + { url = "https://files.pythonhosted.org/packages/25/3c/8e086299cc1a779837e65e4152d03d02b457f8974bb388082fef20894b5e/icechunk-2.0.5-cp314-cp314-win_amd64.whl", hash = "sha256:95b1beb874ad287fcb99dfea29cd5218c795b5d9bca47b8f43ef78b8e6c5b572", size = 15945027, upload-time = "2026-05-18T20:23:40.81Z" }, +] + [[package]] name = "idna" version = "3.11" @@ -2264,15 +2211,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0c/82/a2c93e32800940d9573fb28c346772a14778b84ba7524e691b324620ab89/pyright-1.1.408-py3-none-any.whl", hash = "sha256:090b32865f4fdb1e0e6cd82bf5618480d48eecd2eb2e70f960982a3d9a4c17c1", size = 6399144, upload-time = "2026-01-08T08:07:37.082Z" }, ] -[[package]] -name = "pyshp" -version = "3.0.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8c/20/8b07bae73aaa0c3f5a2683ba6e23b46e977e2d33a88126d56bbcc2d135cd/pyshp-3.0.3.tar.gz", hash = "sha256:bf4678b13dd53578ed87669676a2fffeccbcded1ec8ff9cafb36d1b660f4b305", size = 2192568, upload-time = "2025-11-28T17:47:31.616Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/82/06/cad54e8ce758bd836ee5411691cbd49efeb9cc611b374670fce299519334/pyshp-3.0.3-py3-none-any.whl", hash = "sha256:28c8fac8c0c25bb0fecbbfd10ead7f319c2ff2f3b0b44a94f22bd2c93510ad42", size = 58465, upload-time = "2025-11-28T17:47:30.328Z" }, -] - [[package]] name = "pystac" version = "1.14.3" @@ -2322,6 +2260,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size 
= 22101, upload-time = "2026-03-01T16:00:25.09Z" }, ] +[[package]] +name = "python-multipart" +version = "0.0.29" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4e/fe/70bd71a6738b09a0bdf6480ca6436b167469ca4578b2a0efbe390b4b0e70/python_multipart-0.0.29.tar.gz", hash = "sha256:643e93849196645e2dbdd81a0f8829a23123ad7f797a84a364c6fb3563f18904", size = 45678, upload-time = "2026-05-17T17:29:47.654Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/cb/769cfc37177252872a45a71f3fbdde9d51b471a3f3c14bfe95dde3407386/python_multipart-0.0.29-py3-none-any.whl", hash = "sha256:2ddcc971cef266225f54f552d8fa10bcfbb1f14446caec199060daac59ff2d69", size = 29640, upload-time = "2026-05-17T17:29:45.69Z" }, +] + [[package]] name = "pytz" version = "2026.2"