Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
578492a
docs: add architecture overview — concepts, lifecycle, sync kinds, pl…
turban May 13, 2026
9a80df4
docs: scope architecture doc to main — remove derived/remote sync kinds
turban May 13, 2026
ff7949e
Minor fixes
turban May 13, 2026
8cb6f6f
docs: fix template and artifact path descriptions (package path, conf…
turban May 13, 2026
d4d42c9
docs: rename download function to ingestion function, clarify dual-en…
turban May 13, 2026
0cff32d
docs: add Zarr and GeoZarr concepts page
turban May 13, 2026
e923ceb
fix: increase spatial chunk size to 512x512, add dekadal/weekly perio…
turban May 13, 2026
551905a
refactor: derive zarr time chunk size from ISO 8601 duration step
turban May 13, 2026
f4ad760
refactor: read ISO 8601 step from template period_step field before l…
turban May 13, 2026
56c4049
fix: resolve_iso_step reads extents.temporal.resolution first
turban May 13, 2026
9b37adf
breaking: require extents.temporal.resolution for ISO step resolution
turban May 13, 2026
795ff49
fix: write flat zarr stores in Zarr v3 format
turban May 13, 2026
d3d9b19
docs: update zarr doc to reflect unified v3 format
turban May 13, 2026
ef811c0
docs: align zarr doc with ISO 8601 chunk sizing and v3-only format
turban May 13, 2026
a183711
fix: update missing-resolution warning to reference extents.temporal.…
turban May 13, 2026
cda9fd0
refactor: rename resolve_iso_step to resolve_iso_period_step
turban May 13, 2026
47858b9
fix: guard zero-duration, catch ValueError in chunk sizing, fix test …
turban May 13, 2026
1496b07
docs: add What is Zarr and What is GeoZarr introductory sections with…
turban May 13, 2026
da23be5
docs: add period label to chunk sizing table examples
turban May 13, 2026
68df140
docs: use EPSG:4326 example values in GeoZarr attributes table
turban May 13, 2026
096fd1e
Text improvements
turban May 13, 2026
8256d77
docs: add ARCO section to zarr_and_geozarr.md
turban May 14, 2026
1e448a1
Text changes
turban May 14, 2026
2c6e060
Text changes
turban May 14, 2026
ba5cdf7
Text changes
turban May 14, 2026
e9a72ca
fix: validate extents.temporal.resolution in resolve_iso_period_step
turban May 15, 2026
a2efc18
Merge pull request #129 from dhis2/docs/zarr-and-geozarr
turban May 18, 2026
2a2c5ef
Merge branch 'main' into docs/platform-concepts
turban May 18, 2026
724c3c4
fix: move numpy/pandas imports to top of time.py, wrap long test lines
turban May 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 21 additions & 15 deletions climate_api/data_manager/services/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from topozarr.coarsen import create_pyramid

from climate_api import config as api_config
from climate_api.shared.time import resolve_iso_period_step, time_chunk_for_iso_step
from climate_api.transforms.reproject import reproject_to_instance_crs

from .utils import get_time_dim, get_x_y_dims
Expand Down Expand Up @@ -200,11 +201,7 @@ def build_dataset_zarr(dataset: dict[str, Any], *, start: str | None = None, end

else:
logger.info("Building flat zarr (max dim %d pixels)", max(ds.sizes[x_dim], ds.sizes[y_dim]))
# determine optimal chunk sizes
ds_autochunk = ds.chunk("auto").unify_chunks()
uniform_chunks: dict[str, Any] = {str(dim): ds_autochunk.chunks[dim][0] for dim in ds_autochunk.dims}
time_space_chunks = _compute_time_space_chunks(ds, dataset)
uniform_chunks.update(time_space_chunks)
uniform_chunks = _compute_time_space_chunks(ds, dataset)
logger.info(f"--> {uniform_chunks}")

ds.attrs.update(geozarr_attrs)
Expand All @@ -215,7 +212,7 @@ def build_dataset_zarr(dataset: dict[str, Any], *, start: str | None = None, end
# render missing pixels as transparent — not a separately specified fillValue.
for var in ds_chunked.data_vars:
ds_chunked[var].encoding.pop("_FillValue", None)
ds_chunked.to_zarr(zarr_path, mode="w", consolidated=True)
ds_chunked.to_zarr(zarr_path, mode="w", zarr_format=3, consolidated=True)
ds_chunked.close()

ds.close()
Expand Down Expand Up @@ -292,21 +289,30 @@ def _run_transforms(ds: xr.Dataset, dataset: dict[str, Any]) -> xr.Dataset:
def _compute_time_space_chunks(
ds: xr.Dataset,
dataset: dict[str, Any],
max_spatial_chunk: int = 256,
max_spatial_chunk: int = 512,
) -> dict[str, int]:
"""Compute chunk sizes tuned for common temporal access patterns."""
chunks: dict[str, int] = {}

iso_step = resolve_iso_period_step(dataset)
dim = get_time_dim(ds)
period_type = dataset["period_type"]
if period_type == "hourly":
chunks[dim] = 24 * 7
elif period_type == "daily":
chunks[dim] = 30
elif period_type == "monthly":
if iso_step is not None:
try:
chunks[dim] = time_chunk_for_iso_step(iso_step)
except ValueError:
logger.warning(
"Invalid ISO 8601 step %r for dataset '%s'; defaulting time chunk to 12.",
iso_step,
dataset.get("id", "?"),
)
chunks[dim] = 12
else:
logger.warning(
"No ISO 8601 step for dataset '%s'; defaulting time chunk to 12. "
"Declare 'extents.temporal.resolution' in the template to silence this warning.",
dataset.get("id", "?"),
)
chunks[dim] = 12
elif period_type == "yearly":
chunks[dim] = 1

x_dim, y_dim = get_x_y_dims(ds)
chunks[x_dim] = min(ds.sizes[x_dim], max_spatial_chunk)
Expand Down
63 changes: 63 additions & 0 deletions climate_api/shared/time.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,75 @@
"""Time helpers shared across Climate API modules."""

import logging
import re
from datetime import UTC, date, datetime
from typing import Any, cast

import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)

# Matches ISO 8601 durations of the form PnYnMnWnDTnHnMnS with integer values only.
# Capture groups, in order: years, months, weeks, days, hours, minutes, seconds.
# Every component is optional, so component-less strings such as "P" also match.
_ISO_DURATION_RE = re.compile(r"^P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?$")


def resolve_iso_period_step(dataset: dict[str, Any]) -> str | None:
"""Return the ISO 8601 duration step from ``extents.temporal.resolution``.

Returns None if the field is absent or not a valid ISO 8601 duration, logging
a warning in the latter case.
"""
extents = dataset.get("extents")
if not isinstance(extents, dict):
return None
temporal = extents.get("temporal")
if not isinstance(temporal, dict):
return None
resolution = temporal.get("resolution")
if not resolution:
return None
resolution_str = str(resolution)
try:
_iso_step_to_approx_hours(resolution_str)
except ValueError:
logger.warning("Invalid ISO 8601 duration in extents.temporal.resolution: %r", resolution_str)
return None
return resolution_str


def _iso_step_to_approx_hours(step: str) -> float:
    """Convert an ISO 8601 duration string to an approximate number of hours.

    Calendar units are averaged: a year counts as 365.25 days and a month as
    30.4375 days.

    Raises:
        ValueError: if *step* does not match ``_ISO_DURATION_RE``, or if the
            resulting duration is not positive.
    """
    parsed = _ISO_DURATION_RE.fullmatch(step)
    if parsed is None:
        raise ValueError(f"Cannot parse ISO 8601 duration: '{step}'")

    # Components the string omits are captured as None and count as zero.
    years, months, weeks, days, hours, minutes, seconds = [
        int(part) if part else 0 for part in parsed.groups()
    ]

    # Accumulate from the coarsest unit down to the finest.
    total = years * 365.25 * 24
    total += months * 30.4375 * 24
    total += weeks * 7 * 24
    total += days * 24
    total += hours
    total += minutes / 60
    total += seconds / 3600

    if total <= 0:
        raise ValueError(f"ISO 8601 duration '{step}' resolves to zero — cannot derive chunk size")
    return total


def time_chunk_for_iso_step(step: str) -> int:
    """Pick a zarr time-chunk length for data sampled at the given ISO 8601 step.

    The chunk is sized to span a target window that depends on the step length:

    * steps shorter than 24 hours -> a 7-day window,
    * steps from 24 hours up to (but excluding) 7 days -> a 30-day window,
    * steps of 7 days or longer -> a 365.25-day window.

    The result is the rounded number of steps that fit in the window and is
    never smaller than 1. A ``ValueError`` raised by
    ``_iso_step_to_approx_hours`` for an unusable *step* propagates unchanged.
    """
    step_hours = _iso_step_to_approx_hours(step)

    if step_hours < 24:
        window_hours = 24 * 7  # ~1 week
    elif step_hours < 24 * 7:
        window_hours = 24 * 30  # ~1 month
    else:
        window_hours = 24 * 365.25  # ~1 year

    return max(1, round(window_hours / step_hours))


# Matches a four-digit year, the literal "-W", and a two-digit week (for example
# "2026-W07"), exposed as the named groups "year" and "week".
_WEEKLY_PERIOD_PATTERN = re.compile(r"^(?P<year>\d{4})-W(?P<week>\d{2})$")


Expand Down
16 changes: 2 additions & 14 deletions climate_api/stac/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from climate_api.data_registry.services import datasets as registry_datasets
from climate_api.ingestions import services as ingestion_services
from climate_api.ingestions.schemas import ArtifactFormat, ArtifactRecord, PublicationStatus
from climate_api.shared.time import parse_period_string_to_datetime
from climate_api.shared.time import parse_period_string_to_datetime, resolve_iso_period_step

CATALOG_ID = "climate-api"
CATALOG_TITLE = "DHIS2 Climate API"
Expand Down Expand Up @@ -118,7 +118,7 @@ def build_collection(dataset_id: str, request: Request) -> dict[str, object]:
collection_payload["license"] = template.license
_remove_helper_variables(collection_payload)
_round_spatial_steps(collection_payload)
_override_time_step(collection_payload, _period_step(source_dataset.get("period_type")))
_override_time_step(collection_payload, resolve_iso_period_step(source_dataset))
_override_spatial_extent_from_artifact(collection_payload, artifact)
_override_temporal_extent_from_artifact(collection_payload, artifact)
_sanitize_variable_attrs(collection_payload)
Expand Down Expand Up @@ -318,18 +318,6 @@ def _abs_url(request: Request, path: str) -> str:
return f"{str(request.base_url).rstrip('/')}{path}"


def _period_step(period_type: object) -> str | None:
if period_type == "hourly":
return "PT1H"
if period_type == "daily":
return "P1D"
if period_type == "monthly":
return "P1M"
if period_type == "yearly":
return "P1Y"
return None


def _override_time_step(collection: dict[str, Any], step: str | None) -> None:
if step is None:
return
Expand Down
Loading
Loading