diff --git a/climate_api/data_manager/services/downloader.py b/climate_api/data_manager/services/downloader.py index 85984534..276cd7b0 100644 --- a/climate_api/data_manager/services/downloader.py +++ b/climate_api/data_manager/services/downloader.py @@ -17,6 +17,7 @@ from topozarr.coarsen import create_pyramid from climate_api import config as api_config +from climate_api.shared.time import resolve_iso_period_step, time_chunk_for_iso_step from climate_api.transforms.reproject import reproject_to_instance_crs from .utils import get_time_dim, get_x_y_dims @@ -200,11 +201,7 @@ def build_dataset_zarr(dataset: dict[str, Any], *, start: str | None = None, end else: logger.info("Building flat zarr (max dim %d pixels)", max(ds.sizes[x_dim], ds.sizes[y_dim])) - # determine optimal chunk sizes - ds_autochunk = ds.chunk("auto").unify_chunks() - uniform_chunks: dict[str, Any] = {str(dim): ds_autochunk.chunks[dim][0] for dim in ds_autochunk.dims} - time_space_chunks = _compute_time_space_chunks(ds, dataset) - uniform_chunks.update(time_space_chunks) + uniform_chunks = _compute_time_space_chunks(ds, dataset) logger.info(f"--> {uniform_chunks}") ds.attrs.update(geozarr_attrs) @@ -215,7 +212,7 @@ def build_dataset_zarr(dataset: dict[str, Any], *, start: str | None = None, end # render missing pixels as transparent — not a separately specified fillValue. 
for var in ds_chunked.data_vars: ds_chunked[var].encoding.pop("_FillValue", None) - ds_chunked.to_zarr(zarr_path, mode="w", consolidated=True) + ds_chunked.to_zarr(zarr_path, mode="w", zarr_format=3, consolidated=True) ds_chunked.close() ds.close() @@ -292,21 +289,30 @@ def _run_transforms(ds: xr.Dataset, dataset: dict[str, Any]) -> xr.Dataset: def _compute_time_space_chunks( ds: xr.Dataset, dataset: dict[str, Any], - max_spatial_chunk: int = 256, + max_spatial_chunk: int = 512, ) -> dict[str, int]: """Compute chunk sizes tuned for common temporal access patterns.""" chunks: dict[str, int] = {} + iso_step = resolve_iso_period_step(dataset) dim = get_time_dim(ds) - period_type = dataset["period_type"] - if period_type == "hourly": - chunks[dim] = 24 * 7 - elif period_type == "daily": - chunks[dim] = 30 - elif period_type == "monthly": + if iso_step is not None: + try: + chunks[dim] = time_chunk_for_iso_step(iso_step) + except ValueError: + logger.warning( + "Invalid ISO 8601 step %r for dataset '%s'; defaulting time chunk to 12.", + iso_step, + dataset.get("id", "?"), + ) + chunks[dim] = 12 + else: + logger.warning( + "No ISO 8601 step for dataset '%s'; defaulting time chunk to 12. 
" + "Declare 'extents.temporal.resolution' in the template to silence this warning.", + dataset.get("id", "?"), + ) chunks[dim] = 12 - elif period_type == "yearly": - chunks[dim] = 1 x_dim, y_dim = get_x_y_dims(ds) chunks[x_dim] = min(ds.sizes[x_dim], max_spatial_chunk) diff --git a/climate_api/shared/time.py b/climate_api/shared/time.py index f6bf7e25..922f65fe 100644 --- a/climate_api/shared/time.py +++ b/climate_api/shared/time.py @@ -1,5 +1,6 @@ """Time helpers shared across Climate API modules.""" +import logging import re from datetime import UTC, date, datetime from typing import Any, cast @@ -7,6 +8,68 @@ import numpy as np import pandas as pd +logger = logging.getLogger(__name__) + +_ISO_DURATION_RE = re.compile(r"^P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?$") + + +def resolve_iso_period_step(dataset: dict[str, Any]) -> str | None: + """Return the ISO 8601 duration step from ``extents.temporal.resolution``. + + Returns None if the field is absent or not a valid ISO 8601 duration, logging + a warning in the latter case. + """ + extents = dataset.get("extents") + if not isinstance(extents, dict): + return None + temporal = extents.get("temporal") + if not isinstance(temporal, dict): + return None + resolution = temporal.get("resolution") + if not resolution: + return None + resolution_str = str(resolution) + try: + _iso_step_to_approx_hours(resolution_str) + except ValueError: + logger.warning("Invalid ISO 8601 duration in extents.temporal.resolution: %r", resolution_str) + return None + return resolution_str + + +def _iso_step_to_approx_hours(step: str) -> float: + """Return the approximate duration in hours for an ISO 8601 duration string. + + Months and years use calendar averages (30.4375 days/month, 365.25 days/year). + Raises ValueError for unrecognised formats. 
+ """ + m = _ISO_DURATION_RE.fullmatch(step) + if not m: + raise ValueError(f"Cannot parse ISO 8601 duration: '{step}'") + years, months, weeks, days, hours, minutes, seconds = (int(g or 0) for g in m.groups()) + result = ( + years * 365.25 * 24 + months * 30.4375 * 24 + weeks * 7 * 24 + days * 24 + hours + minutes / 60 + seconds / 3600 + ) + if result <= 0: + raise ValueError(f"ISO 8601 duration '{step}' resolves to zero — cannot derive chunk size") + return result + + +def time_chunk_for_iso_step(step: str) -> int: + """Return a suitable zarr time chunk size for a given ISO 8601 duration step. + + Targets roughly one week of data for sub-daily steps, one month for daily/sub-weekly + steps, and one year for weekly and coarser steps. This keeps individual chunk files + at a manageable size while covering a natural analysis window in one read. + """ + hours = _iso_step_to_approx_hours(step) + if hours < 24: + return max(1, round(24 * 7 / hours)) # ~1 week + if hours < 24 * 7: + return max(1, round(24 * 30 / hours)) # ~1 month + return max(1, round(24 * 365.25 / hours)) # ~1 year + + _WEEKLY_PERIOD_PATTERN = re.compile(r"^(?P\d{4})-W(?P\d{2})$") diff --git a/climate_api/stac/services.py b/climate_api/stac/services.py index 8c17622f..c8801196 100644 --- a/climate_api/stac/services.py +++ b/climate_api/stac/services.py @@ -18,7 +18,7 @@ from climate_api.data_registry.services import datasets as registry_datasets from climate_api.ingestions import services as ingestion_services from climate_api.ingestions.schemas import ArtifactFormat, ArtifactRecord, PublicationStatus -from climate_api.shared.time import parse_period_string_to_datetime +from climate_api.shared.time import parse_period_string_to_datetime, resolve_iso_period_step CATALOG_ID = "climate-api" CATALOG_TITLE = "DHIS2 Climate API" @@ -118,7 +118,7 @@ def build_collection(dataset_id: str, request: Request) -> dict[str, object]: collection_payload["license"] = template.license 
_remove_helper_variables(collection_payload) _round_spatial_steps(collection_payload) - _override_time_step(collection_payload, _period_step(source_dataset.get("period_type"))) + _override_time_step(collection_payload, resolve_iso_period_step(source_dataset)) _override_spatial_extent_from_artifact(collection_payload, artifact) _override_temporal_extent_from_artifact(collection_payload, artifact) _sanitize_variable_attrs(collection_payload) @@ -318,18 +318,6 @@ def _abs_url(request: Request, path: str) -> str: return f"{str(request.base_url).rstrip('/')}{path}" -def _period_step(period_type: object) -> str | None: - if period_type == "hourly": - return "PT1H" - if period_type == "daily": - return "P1D" - if period_type == "monthly": - return "P1M" - if period_type == "yearly": - return "P1Y" - return None - - def _override_time_step(collection: dict[str, Any], step: str | None) -> None: if step is None: return diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 00000000..0f2a8a71 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,293 @@ +# Architecture + +This document explains how the Climate API is structured, why it is structured that way, and what the consequences are of each design decision. It is written for developers who will maintain or extend the platform over time. + +--- + +## Core concepts + +The platform has four first-class concepts. Understanding the distinction between them is the foundation for understanding everything else. + +### Dataset template + +A **template** is a YAML blueprint that describes a data source. Built-ins live in `climate_api/data/datasets/` inside the package (loaded via `importlib.resources`). Custom templates live in `{plugins_dir}/datasets/` where `plugins_dir` is set in `climate-api.yaml`. It has no state — it describes what _could_ be ingested, not what _has been_ ingested. 
+ +A template defines: + +- the dataset identifier and display metadata +- the variable name, units, and period type +- how to download the data (`ingestion.function`) +- what transforms to apply (`transforms`) +- what sync strategy to use (`sync.kind`, `sync.execution`) + +Templates are config, not code. If a template needs custom logic, the logic goes into a Python function referenced by dotted path from the YAML. + +### Artifact + +An **artifact** is the internal record of a completed data ingestion. It is the persistence layer — not a public API concept. Each ingestion produces exactly one artifact, which records: + +- what dataset template it came from +- the exact spatial extent and time range that was materialized +- where the data lives on disk (path to the zarr store or netCDF files) +- when it was created +- whether it has been published + +Multiple artifacts can exist for the same dataset template if data was ingested at different times (they form the version history). The most recent artifact for a given `dataset_id` is what the public API serves. + +Artifacts are stored in `{data_dir}/artifacts/records.json`, where `data_dir` is the path configured in `climate-api.yaml`. This is an internal implementation detail — consumers should never depend on artifact IDs or artifact paths directly. + +### Managed dataset + +A **managed dataset** is the public-facing view of the most recent artifact for a given template. It is what `/datasets`, `/zarr`, `/stac`, and `/ogcapi` expose. When an operator ingests or syncs a dataset, the managed dataset view updates to reflect the new artifact — the public ID stays stable. + +The relationship is: one template → many artifacts over time → one managed dataset (the latest). + +### Extent + +The **extent** is the spatial bounding box configured for this Climate API instance. It is set once in `climate-api.yaml` and does not change at runtime. 
Every ingestion is automatically scoped to this extent — operators do not specify it per-request. + +This is a deliberate design constraint: each instance serves one place. A Sierra Leone instance serves Sierra Leone. Multi-country coverage requires multiple instances. + +--- + +## Data lifecycle + +``` +Template (YAML) + │ + │ POST /ingestions (or POST /sync) + ▼ +Ingestion + │ call ingestion function → NetCDF files on disk + │ apply transforms + │ reproject to instance CRS + │ write GeoZarr store + │ compute coverage (spatial + temporal extent of actual data) + ▼ +Artifact (internal record) + │ + │ publish=true + ▼ +Managed dataset (public API) + ├── /datasets/{id} — native metadata + ├── /zarr/{id} — raw zarr store access + ├── /stac/collections/{id} — STAC discovery + └── /ogcapi/collections/{id} — OGC API access +``` + +The ingestion function is called identically by both `POST /ingestions` and `POST /sync` — the framework invokes it the same way regardless of the trigger. A correctly written ingestion function works for both without any changes. + +The framework is responsible for everything after the ingestion function returns. An ingestion function only needs to write NetCDF files to a given directory. The framework then: + +1. reads and normalises the coordinate names +2. applies transforms (unit conversion, etc.) +3. reprojects to the instance CRS +4. builds the zarr store with auto-computed chunking +5. writes GeoZarr root attributes (`spatial:bbox`, `proj:code`) so map clients can position tiles +6. computes artifact coverage (spatial bounds + time range) from the written data +7. stores the artifact record +8. publishes the managed dataset through pygeoapi if `publish=true` + +This division means that ingestion functions do not need to know about zarr conventions, STAC, OGC, or pygeoapi. They write data files; the framework handles everything else. + +--- + +## Sync kinds + +The `sync.kind` field in a template determines how a managed dataset is kept current. 
+ +| `sync.kind` | On each sync | Use when | +| ----------- | --------------------------------------- | ----------------------------------------------------------------- | +| `temporal` | Append new time steps, or rematerialize | Historical record that grows over time (CHIRPS, ERA5-Land) | +| `release` | Rematerialize if a newer release exists | Versioned releases where each year/version is discrete (WorldPop) | +| `static` | Never synced | One-time fixed dataset with no updates | + +### The sync execution modes + +Within `sync.kind: temporal`, two execution modes control what happens when new data is available: + +- `append` — downloads only the missing time range and appends it to the existing artifact +- `rematerialize` — discards the existing artifact and rebuilds it from scratch + +`append` is efficient for large historical datasets (avoid re-downloading years of data on each sync). `rematerialize` is appropriate when old data may change retroactively (e.g. reanalysis products that are corrected after the fact). + +### Availability clamping + +Providers publish data on a delay. The `sync.availability` block in a template tells the sync engine how far back from today data is reliably available: + +```yaml +sync: + kind: temporal + execution: append + availability: + latest_available_function: climate_api.providers.availability.lagged_latest_available + lag_hours: 120 +``` + +Before executing a sync, the engine calls the availability function to clamp the target end date. This prevents requesting data that has not yet been published, which would leave temporal gaps. + +--- + +## The plugin contract + +The platform has four extension points. Each one has a narrow contract — the framework handles everything else automatically. + +### Ingestion function + +```python +def download( + *, + start: str, # ISO 8601 date or datetime + end: str, + dirname: Path, # write output files here + prefix: str, # use as filename prefix, e.g. 
f"{prefix}_{year}.nc" + overwrite: bool, + bbox: list[float], # optional — only if the source needs a spatial filter + **kwargs, # default_params from the YAML template +) -> None: + # Write one or more NetCDF files to dirname. +``` + +The function writes NetCDF files. The framework reads them, normalises coordinate names, applies transforms, reprojects to the instance CRS, builds the zarr, writes GeoZarr attributes, computes coverage, and registers the artifact. + +The ingestion function is called identically by `POST /ingestions` and `POST /sync`. The caller makes no difference to the function — it always receives the same parameters. + +**Reusing ingestion logic across templates**: multiple YAML templates can reference the same Python function and differentiate via `default_params`. This is the intended pattern for sources that have the same fetching logic but expose different variables: + +```yaml +# era5land_temperature_hourly.yaml +ingestion: + function: dhis2eo.data.era5_land.download + default_params: + variable: 2m_temperature + +# era5land_precipitation_hourly.yaml +ingestion: + function: dhis2eo.data.era5_land.download + default_params: + variable: total_precipitation +``` + +No framework changes are needed to support a new variable from the same source. + +### Transform function + +```python +def my_transform(ds: xr.Dataset, dataset: dict) -> xr.Dataset: + # Receive the dataset after download, return a modified dataset. + # Modify ds[dataset["variable"]] values and variable attributes. + # Do not modify dataset-level ds.attrs — the framework manages those. +``` + +Transforms are applied in order after the ingestion function returns, before the zarr is written. They receive the full xarray Dataset and the template dict. They return a modified Dataset. They do not write to disk. + +### Process execution function + +```python +def execute(*, source_dataset_id: str, **kwargs) -> dict: + # Run a named operation (e.g. temporal resampling). 
+ # Return a JSON-serialisable result dict. +``` + +Processes are named operations triggered via `POST /processes/{id}/execution`. They are broader than single-dataset transforms — they can read one managed dataset and produce another (e.g. daily → monthly aggregation). + +--- + +## The transform pipeline + +Transforms are applied at a consistent point in the ingestion lifecycle: + +1. ingestion function writes raw NetCDF files to disk +2. framework reads and normalises the data into an xarray Dataset +3. `_run_transforms(ds, dataset)` applies each declared transform in order +4. result is reprojected to instance CRS +5. zarr store is written with auto-computed chunking +6. framework writes GeoZarr root attributes +7. framework computes coverage from the zarr + +Transforms see post-download, pre-reproject data. They should only modify data values and variable-level attributes. The framework writes dataset-level attributes (GeoZarr) after the transform pipeline completes. + +--- + +## GeoZarr root attributes + +Every zarr artifact must have GeoZarr root attributes for map rendering to work correctly. These are written into `zarr.json` at the store root: + +- `spatial:bbox` — `[xmin, ymin, xmax, ymax]` in the native CRS +- `proj:code` — the CRS EPSG code (e.g. `EPSG:32633` for UTM, `EPSG:4326` for WGS84) +- `zarr_conventions` — GeoZarr convention declaration + +The map viewer reads `spatial:bbox` and `proj:code` to determine where to position tiles on the map. + +**The framework writes these attributes — plugins do not.** They are written in `build_dataset_zarr` after transforms and reprojection, using the actual coordinate bounds of the final written data and the instance CRS. 
+ +--- + +## CRS handling + +The instance CRS is configured in `climate-api.yaml`: + +```yaml +extent: + name: Norway + bbox: [3.0, 57.0, 32.0, 72.5] + crs: EPSG:32633 # optional; defaults to EPSG:4326 +``` + +Downloaded data is reprojected from the source CRS (`source_crs` in the template, default `EPSG:4326`) to the instance CRS during ingestion. The stored zarr is always in the instance CRS. + +If no `crs` is set in the config, data is stored in `EPSG:4326` (WGS84). This is the correct default for instances that do not need a metric CRS. + +--- + +## Artifact deduplication and version history + +When a new ingestion request arrives, the framework checks whether an existing artifact already covers the requested scope: + +- same `dataset_id` +- same bbox (from the configured extent) +- overlapping time range + +If a match exists and `overwrite=false`, the existing artifact is returned without re-downloading. If `overwrite=true`, the existing artifact is replaced. + +The artifact store keeps the full history of records for sync deduplication and provenance. Old artifacts are not deleted automatically. For long-running instances, `records.json` grows over time. The long-term direction is a proper transactional store, but for the current scale (tens of artifacts per instance) a JSON file is adequate. + +--- + +## What the framework guarantees + +Plugin code (ingestion functions, transforms, processes) can rely on the following being handled automatically by the framework: + +| Concern | Where handled | +| ----------------------------------------------------- | ------------------------------------------- | +| Coordinate name normalisation (`lat` → `y`, etc.) 
| `build_dataset_zarr` | +| Reprojection to instance CRS | `reproject_to_instance_crs` | +| Zarr chunking (auto-sized from `extents.temporal.resolution`) | `_compute_time_space_chunks` | +| Multiscale pyramid generation (when dims > 2048×2048) | `build_dataset_zarr` | +| GeoZarr root attributes (`spatial:bbox`, `proj:code`) | `build_dataset_zarr` | +| Artifact coverage computation | `_coverage_from_dataset` | +| Artifact record persistence | `_store_artifact` | +| pygeoapi publication | `publish_artifact_record` if `publish=true` | +| STAC collection generation | Dynamic from artifact record | + +Plugin code only needs to produce data files. Everything else is the framework's responsibility. + +--- + +## Consequences of design choices + +### Single extent per instance + +Each instance is configured for one place. This keeps the data model simple (no per-artifact extent tags) and the zarr stores small (country-scale downloads rather than global). The trade-off is that a national ministry that needs sub-national data must either run multiple instances or configure a single instance at national extent. + +### Temporal gaps are not allowed + +The sync engine validates that new data connects to the end of the existing artifact before appending. If a gap exists, the sync fails rather than silently producing a dataset with a hole. This is a deliberate constraint: downstream consumers (DHIS2, CHAP) depend on continuous time series and should not receive data with silent gaps. + +### The append execution mode avoids re-downloading history + +`append` downloads only the missing range and rebuilds the full zarr from all cached files. This means the local cache (NetCDF files in `data/downloads/`) is the source of truth for the full time series; the zarr is a derived view. If the cache is deleted, a rematerialize is required to recover. + +### Transforms run after download, before reproject + +Transforms see raw downloaded values in the source CRS and source units. 
The order is: download → transform → reproject → write zarr. diff --git a/docs/zarr_and_geozarr.md b/docs/zarr_and_geozarr.md new file mode 100644 index 00000000..48ae3f52 --- /dev/null +++ b/docs/zarr_and_geozarr.md @@ -0,0 +1,147 @@ +# Zarr and GeoZarr + +This document explains why the Climate API uses Zarr as its primary storage format, how Zarr stores are structured and served, and how GeoZarr root attributes enable map rendering. + +--- + +## What is Zarr? + +[Zarr](https://zarr.dev) is an open storage format for chunked, compressed N-dimensional arrays. A Zarr store is a directory tree: array metadata lives in `zarr.json` files, and the data itself is split into independent chunk files. Each chunk is compressed independently and can be read in a single HTTP request. + +Zarr is designed to work natively in cloud object stores as well as on local disk — the directory layout is the same in both cases. The [Zarr v3 specification](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html) is the current standard. + +--- + +## What is GeoZarr? + +[GeoZarr](https://github.com/zarr-developers/geozarr-spec) is a draft convention that adds spatial context to Zarr stores. A plain Zarr array has no concept of geography — it is just numbers in a grid. GeoZarr defines a small set of root attributes (`spatial:bbox`, `proj:code`, `zarr_conventions`) that tell a client where the grid is located on Earth and in which coordinate reference system. + +--- + +## Why Zarr + +Climate datasets are large, multi-dimensional arrays: a daily precipitation dataset covering a country at 5 km resolution for 10 years has roughly 3 600 time steps and hundreds of thousands of spatial pixels. Serving this efficiently from a REST API requires a format that supports: + +- **Chunk-level random access** — a client requesting one time step should not have to read the entire file. Zarr stores data in independent, addressable chunks; a request for a single date reads only the relevant chunk. 
+- **HTTP-native serving** — each chunk is a separate file on disk. A standard `GET /zarr/{dataset_id}/{chunk_path}` serves it with a regular `FileResponse`. No specialised server software is needed. +- **Cloud compatibility** — the same directory layout works on local disk and cloud storage without code changes. +- **Multiscale pyramids** — GeoZarr defines a multiscales convention that allows a store to contain multiple resolution levels. Map clients request only the level that matches their current zoom, avoiding full-resolution downloads. + +--- + +## ARCO: Analysis-Ready, Cloud-Optimized + +The stores produced by the Climate API are an instance of the **ARCO** pattern — a term from the climate science community describing datasets that are simultaneously ready for analysis and optimised for cloud access. + +The two halves of the term map directly onto the choices described in this document: + +**Analysis-ready** means a consumer can open the data and start computing without preprocessing: + +- Dimension names are normalised to `(time, x, y)` regardless of the source convention. +- All datasets in an instance share a single coordinate reference system. +- Units are standardised by the transform pipeline (e.g. Kelvin → Celsius). + +**Cloud-optimized** means the data can be accessed efficiently over HTTP without downloading the whole file. The Zarr and GeoZarr formats provide all the necessary properties — chunk-level access, HTTP-native serving, multiscale pyramids, and cloud compatibility. + +The Climate API targets the same access pattern at country scale for arbitrary source datasets. + +--- + +## Store layout on disk + +Each managed dataset has exactly one Zarr store on disk, stored under `{data_dir}/downloads/{dataset_id}.zarr`. The store is either: + +- **Flat** — a single-resolution Zarr store with dimensions `(time, x, y)` +- **Pyramid** — a multi-resolution Zarr store with levels `0/`, `1/`, `2/`, … where `0/` is the full resolution + +The flat vs. 
pyramid decision is made at build time based on spatial size (see [Multiscale pyramids](#multiscale-pyramids) below). + +--- + +## Chunk sizing + +Chunks are sized to match expected access patterns. The goal is that reading one time step for the full spatial extent fits in one round-trip, and that full time series for a small area also fits in one round-trip. + +Time chunk sizes are derived from the dataset's `extents.temporal.resolution` field, an ISO 8601 duration (e.g. `P1D`, `PT1H`, `P1M`). When present and valid, the duration is converted to approximate hours and mapped to a natural analysis window: + +| Duration tier | Approximate hours | Target window | Example | +| ------------------- | ----------------- | ------------- | --------------------------- | +| Sub-daily | < 24 h | ~1 week | `PT1H` (hourly) → 168 steps | +| Daily to sub-weekly | 24 h – 168 h | ~1 month | `P1D` (daily) → 30 steps | +| Weekly and coarser | ≥ 168 h | ~1 year | `P1M` (monthly) → 12 steps | + +This calculation is fully data-driven: any dataset — including custom or plugin datasets — only needs to declare `extents.temporal.resolution` and the correct chunk size is computed automatically. If the field is absent or not a valid ISO 8601 duration, a warning is logged and the time chunk defaults to 12 steps. + +Spatial chunks are capped at 512 × 512 pixels — a pragmatic compromise between tile rendering (which benefits from smaller chunks) and analysis workloads (which benefit from larger ones). For small extents where the full spatial dimension is smaller than 512 pixels, the entire dimension fits in one chunk. + +Dimension names are normalised to `(time, x, y)` before writing, regardless of the source naming convention (`lat`/`lon`, `latitude`/`longitude`, etc.). + +--- + +## Multiscale pyramids + +For large spatial extents, a flat zarr would require a map viewer to download the entire spatial extent at full resolution on every tile request. 
The platform builds a multiscale pyramid when the spatial dimensions exceed **2048 × 2048 pixels**. + +Pyramid levels are computed as: + +``` +levels = ceil(log2(max_dim / 512)) # clamped to [2, 8] +``` + +Where 512 is the target tile size in pixels. Each level halves the resolution in both spatial dimensions using mean downsampling. Level `0/` is always the full resolution. + +Both flat and pyramid stores are written in **Zarr v3** format. + +--- + +## GeoZarr root attributes + +A plain Zarr store has no concept of spatial coordinates. A map viewer opening it has no way to know where to position tiles on a map. GeoZarr addresses this by writing a small set of attributes into `zarr.json` at the store root: + +| Attribute | Example value | Purpose | +| ------------------ | ------------------------- | ------------------------------ | +| `spatial:bbox` | `[3.0, 57.0, 32.0, 72.5]` | Bounding box in the native CRS | +| `proj:code` | `EPSG:4326` | CRS of the stored coordinates | +| `zarr_conventions` | `[{...}]` | Convention declarations | + +These attributes are computed from the actual coordinate bounds of the written data and the instance CRS. They are always written by the framework after transforms and reprojection have run. This guarantees they always reflect the final stored data. + +`zarr_conventions` for a flat store contains the base GeoZarr convention declaration. For pyramid stores it also includes a multiscales entry that declares the level structure. + +--- + +## CRS handling + +The instance CRS is configured in `climate-api.yaml`: + +```yaml +extent: + bbox: [3.0, 57.0, 32.0, 72.5] + crs: EPSG:32633 # optional; defaults to EPSG:4326 +``` + +Datasets are always stored in the instance CRS. During ingestion, data is reprojected from its source CRS (declared as `source_crs` in the template, default `EPSG:4326`) to the instance CRS. 
The stored `spatial:bbox` is therefore in the instance CRS — UTM eastings and northings for a UTM instance, degrees for a WGS84 instance. + +STAC metadata also stores the WGS84 bounding box alongside the native bbox, so catalogue clients that expect geographic coordinates always get one regardless of the instance CRS. + +--- + +## How Zarr stores are served + +The `/zarr/{dataset_id}/` endpoint serves individual files from the Zarr store directory using FastAPI's `FileResponse`. The ZarrLayer client issues one HTTP request per chunk file it needs. + +``` +GET /zarr/{dataset_id}/zarr.json → root metadata (JSON) +GET /zarr/{dataset_id}/precip/c/0/0/0 → chunk at time=0, x=0, y=0 +GET /zarr/{dataset_id}/time/c/0 → time coordinate chunk +``` + +Metadata files (`zarr.json`) are returned as `application/json`. All other files — chunk data — are returned as `application/octet-stream`. Directory paths return a JSON listing of their contents. + +This design means the zarr store is served by ordinary file serving — there is no zarr-specific server middleware. + +--- + +## Fill values and NaN handling + +When writing float data to Zarr, missing data is stored as IEEE `NaN`. The map viewer uses the zarr `fill_value` attribute (which defaults to `NaN` for float arrays) to render missing pixels as transparent. 
diff --git a/mkdocs.yml b/mkdocs.yml index 151c3a5d..73c5c957 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -66,6 +66,8 @@ nav: - Home: index.md - Get started: setup_guide.md - Concepts: + - Architecture: architecture.md + - Zarr and GeoZarr: zarr_and_geozarr.md - Project overview: project_description.md - Implementation status: implementation-status.md - OGC API: ogcapi.md diff --git a/tests/test_stac.py b/tests/test_stac.py index f733ac88..057170f3 100644 --- a/tests/test_stac.py +++ b/tests/test_stac.py @@ -128,7 +128,12 @@ def test_collection_uses_xstac_and_adds_expected_fields(client: TestClient, monk monkeypatch.setattr( stac_services.registry_datasets, "get_dataset", - lambda _: {"period_type": "daily", "units": "mm", "source": "CHIRPS v3"}, + lambda _: { + "period_type": "daily", + "units": "mm", + "source": "CHIRPS v3", + "extents": {"temporal": {"resolution": "P1D"}}, + }, ) monkeypatch.setattr( stac_services, @@ -250,7 +255,12 @@ def test_collection_sets_hourly_step_to_pt1h(client: TestClient, monkeypatch: py monkeypatch.setattr( stac_services.registry_datasets, "get_dataset", - lambda _: {"period_type": "hourly", "source": "ERA5-Land", "short_name": "2m temperature"}, + lambda _: { + "period_type": "hourly", + "source": "ERA5-Land", + "short_name": "2m temperature", + "extents": {"temporal": {"resolution": "PT1H"}}, + }, ) monkeypatch.setattr( stac_services,