Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion changes/3924.bugfix.md

This file was deleted.

18 changes: 9 additions & 9 deletions src/zarr/storage/_fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
)
from zarr.core.buffer import Buffer
from zarr.errors import ZarrUserWarning
from zarr.storage._utils import _join_paths, normalize_path
from zarr.storage._utils import _dereference_path

if TYPE_CHECKING:
from collections.abc import AsyncIterator, Iterable
Expand Down Expand Up @@ -127,7 +127,7 @@ def __init__(
) -> None:
super().__init__(read_only=read_only)
self.fs = fs
self.path = normalize_path(path)
self.path = path
self.allowed_exceptions = allowed_exceptions

if not self.fs.async_impl:
Expand Down Expand Up @@ -282,7 +282,7 @@ async def get(
# docstring inherited
if not self._is_open:
await self._open()
path = _join_paths([self.path, key])
path = _dereference_path(self.path, key)

try:
if byte_range is None:
Expand Down Expand Up @@ -329,7 +329,7 @@ async def set(
raise TypeError(
f"FsspecStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead."
)
path = _join_paths([self.path, key])
path = _dereference_path(self.path, key)
# write data
if byte_range:
raise NotImplementedError
Expand All @@ -338,7 +338,7 @@ async def set(
async def delete(self, key: str) -> None:
# docstring inherited
self._check_writable()
path = _join_paths([self.path, key])
path = _dereference_path(self.path, key)
try:
await self.fs._rm(path)
except FileNotFoundError:
Expand All @@ -354,14 +354,14 @@ async def delete_dir(self, prefix: str) -> None:
)
self._check_writable()

path_to_delete = _join_paths([self.path, prefix])
path_to_delete = _dereference_path(self.path, prefix)

with suppress(*self.allowed_exceptions):
await self.fs._rm(path_to_delete, recursive=True)

async def exists(self, key: str) -> bool:
# docstring inherited
path = _join_paths([self.path, key])
path = _dereference_path(self.path, key)
exists: bool = await self.fs._exists(path)
return exists

Expand All @@ -378,7 +378,7 @@ async def get_partial_values(
starts: list[int | None] = []
stops: list[int | None] = []
for key, byte_range in key_ranges:
paths.append(_join_paths([self.path, key]))
paths.append(_dereference_path(self.path, key))
if byte_range is None:
starts.append(None)
stops.append(None)
Expand Down Expand Up @@ -429,7 +429,7 @@ async def list_prefix(self, prefix: str) -> AsyncIterator[str]:
yield onefile.removeprefix(f"{self.path}/")

async def getsize(self, key: str) -> int:
path = _join_paths([self.path, key])
path = _dereference_path(self.path, key)
info = await self.fs._info(path)

size = info.get("size")
Expand Down
47 changes: 47 additions & 0 deletions src/zarr/storage/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,53 @@ def _join_paths(paths: Iterable[str]) -> str:
return "/".join(filter(lambda v: v != "", paths))


def _dereference_path(root: str, path: str) -> str:
"""
Combine a store-side root with a key into a single fully-qualified path.

Unlike `_join_paths`, this is purpose-built for the case where `root` is
an opaque backend-side prefix that may use `"/"` as a sentinel for "root
of the filesystem" (notably for fsspec's `ReferenceFileSystem`). A
trailing `"/"` is stripped from `root` before joining; if `root` is then
empty, the bare `path` is returned so that joining `"/"` with `"key"`
yields `"key"` rather than `"//key"`. A trailing `"/"` on the result is
also stripped.

Leading slashes on `root` are preserved -- a backend-side path like
`"/home/foo/data.zarr"` is an absolute filesystem path for
`LocalFileSystem` and must not lose its leading separator.

Parameters
----------
root : str
The backend-side root of a store. May be `""`, `"/"`, an absolute
filesystem path, or a backend-specific prefix.
path : str
The key within the store, typically a zarr key like `"zarr.json"`
or `"a/b/c/zarr.json"`.

Returns
-------
str
`root` and `path` joined by a single `"/"`, with the `"/"` sentinel
collapsed and trailing slashes removed.

Examples
--------
```python
from zarr.storage._utils import _dereference_path
_dereference_path("/", "zarr.json") # 'zarr.json'
_dereference_path("", "zarr.json") # 'zarr.json'
_dereference_path("/home/foo", "zarr.json") # '/home/foo/zarr.json'
_dereference_path("/home/foo/", "zarr.json") # '/home/foo/zarr.json'
_dereference_path("bucket/p", "zarr.json") # 'bucket/p/zarr.json'
```
"""
root = root.rstrip("/")
path = f"{root}/{path}" if root else path
return path.rstrip("/")


def _relativize_path(*, path: str, prefix: str) -> str:
"""
Make a "/"-delimited path relative to some prefix. If the prefix is '', then the path is
Expand Down
124 changes: 106 additions & 18 deletions tests/test_store/test_fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from zarr.errors import ZarrUserWarning
from zarr.storage import FsspecStore
from zarr.storage._fsspec import _make_async
from zarr.storage._utils import normalize_path
from zarr.testing.store import StoreTests

if TYPE_CHECKING:
Expand Down Expand Up @@ -287,25 +286,114 @@ def array_roundtrip(store: FsspecStore) -> None:
np.testing.assert_array_equal(arr[:], data)


@pytest.mark.skipif(
parse_version(fsspec.__version__) < parse_version("2024.12.0"),
reason="No AsyncFileSystemWrapper",
@pytest.mark.parametrize(
("root", "key", "expected"),
[
# `"/"` as root collapses so that bare-key backends (notably
# ReferenceFileSystem) get the right key. Regression test for
# https://github.com/zarr-developers/zarr-python/issues/3922 .
("/", "zarr.json", "zarr.json"),
("", "zarr.json", "zarr.json"),
# Trailing slashes on the root are stripped before joining.
("foo/", "zarr.json", "foo/zarr.json"),
("foo", "zarr.json", "foo/zarr.json"),
# Leading slashes on the root are preserved -- absolute filesystem
# paths must stay absolute. Regression test for the titiler-xarray
# breakage that #3924 introduced when `normalize_path` was applied to
# `FsspecStore.path`.
("/home/runner/data.zarr", "zarr.json", "/home/runner/data.zarr/zarr.json"),
("/home/runner/data.zarr/", "zarr.json", "/home/runner/data.zarr/zarr.json"),
# Multi-segment keys.
("/home/foo", "a/b/zarr.json", "/home/foo/a/b/zarr.json"),
("", "a/b/zarr.json", "a/b/zarr.json"),
# Trailing slash on the result is stripped (relevant when key is "").
("/home/foo", "", "/home/foo"),
],
)
@pytest.mark.parametrize("path", ["", "/", "//", "foo", "foo/", "/foo", "/foo/", "//foo//"])
def test_fsspec_store_path_normalization(path: str) -> None:
"""`FsspecStore.path` is normalized to the canonical form, matching
`normalize_path`, regardless of the surface representation the caller
supplies.

Regression test for https://github.com/zarr-developers/zarr-python/issues/3922
-- when a caller passed `path="/"` the leading slash flowed through
unmodified to subsequent `_join_paths([self.path, key])` calls, producing
`"//key"` and missing the underlying object.
def test_dereference_path(root: str, key: str, expected: str) -> None:
"""Verify the contract `_dereference_path` provides for `FsspecStore`.

`FsspecStore.path` is stored verbatim; the join with a key must collapse a
sentinel `"/"` root, strip trailing slashes, and preserve leading
slashes on absolute paths.
"""
from zarr.storage._utils import _dereference_path

assert _dereference_path(root, key) == expected


async def test_fsspec_store_open_group_via_reference_filesystem() -> None:
"""End-to-end regression test for
https://github.com/zarr-developers/zarr-python/issues/3922 .

``ReferenceFileSystem`` keys its refs by bare strings like ``"zarr.json"``.
The bug was that ``FsspecStore(fs=ref_fs, path="/")`` produced
``"//zarr.json"`` at the join site and failed to find the entry, raising
``GroupNotFoundError``. This test pins ``path="/"`` explicitly to keep
coverage even if the default value changes later.
"""
import json

from fsspec.implementations.reference import ReferenceFileSystem

group_json = json.dumps({"zarr_format": 3, "node_type": "group", "attributes": {}})
fs = ReferenceFileSystem(
fo={"version": 1, "refs": {"zarr.json": group_json}},
asynchronous=True,
)
store = FsspecStore(fs=fs, path="/", read_only=True)
group = await zarr.api.asynchronous.open_group(store, mode="r")
assert group.metadata.zarr_format == 3


async def test_fsspec_store_read_array_chunk_via_reference_filesystem() -> None:
"""End-to-end regression test that exercises the byte-range read path
against ``ReferenceFileSystem``.

Beyond opening a group (covered by
``test_fsspec_store_open_group_via_reference_filesystem``), this test
constructs a small zarr v3 array whose chunk lives in the refs dict and
reads it through the store. Path-handling bugs on the byte-range
fetch path (used by kerchunk-style virtualization) would surface here
rather than at metadata-open time.
"""
sync_fs = fsspec.filesystem("memory")
fs = _make_async(sync_fs)
store = FsspecStore(fs=fs, path=path)
assert store.path == normalize_path(path)
import json

import numpy as np
from fsspec.implementations.reference import ReferenceFileSystem

# Construct a minimal v3 zarr: a single 1-D uint8 array of length 4 with
# one chunk of size 4. The chunk bytes are little-endian uint8s 1..4.
array_meta = json.dumps(
{
"zarr_format": 3,
"node_type": "array",
"shape": [4],
"chunk_grid": {"name": "regular", "configuration": {"chunk_shape": [4]}},
"data_type": "uint8",
"chunk_key_encoding": {"name": "default", "configuration": {"separator": "/"}},
"fill_value": 0,
"codecs": [{"name": "bytes", "configuration": {"endian": "little"}}],
"attributes": {},
}
)
chunk_bytes = bytes([1, 2, 3, 4])

refs: dict[str, str] = {
"zarr.json": array_meta,
# ReferenceFileSystem accepts raw bytes via base64 encoding or
# latin-1-decoded strings; latin-1 round-trips bytes 1:1.
"c/0": chunk_bytes.decode("latin-1"),
}

fs = ReferenceFileSystem(
fo={"version": 1, "refs": refs},
asynchronous=True,
)
store = FsspecStore(fs=fs, path="/", read_only=True)
array = await zarr.api.asynchronous.open_array(store=store, mode="r")
data = await array.getitem(slice(None))
np.testing.assert_array_equal(data, np.array([1, 2, 3, 4], dtype="uint8"))


@pytest.mark.skipif(
Expand Down
Loading