diff --git a/docs/release-notes/2011.feature.md b/docs/release-notes/2011.feature.md new file mode 100644 index 000000000..c34c66469 --- /dev/null +++ b/docs/release-notes/2011.feature.md @@ -0,0 +1 @@ +Use {attr}`numpy.dtypes.StringDType` with `na_object` set to {attr}`pandas.NA` for nullable string data with {class}`anndata.experimental.backed.Dataset2D` {user}`ilan-gold` diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 44f03d827..99f72416d 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -15,6 +15,7 @@ from anndata._core.xarray import Dataset2D, requires_xarray from anndata.abc import CSCDataset, CSRDataset from anndata.compat import ( + NULLABLE_NUMPY_STRING_TYPE, DaskArray, H5Array, H5Group, @@ -259,8 +260,7 @@ def _gen_xarray_dict_iterator_from_elems( "base_path_or_zarr_group": v.base_path_or_zarr_group, "elem_name": v.elem_name, "is_nullable_string": isinstance(v, MaskedArray) - and v.dtype # CategoricalArray dtype access requires a read nad is not necessary here - == np.dtype("O"), + and v.dtype == NULLABLE_NUMPY_STRING_TYPE, }, ) elif k == dim_name: diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index ba0835607..b8c63e4f2 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -404,3 +404,10 @@ def _map_cat_to_str(cat: pd.Categorical) -> pd.Categorical: return cat.map(str, na_action="ignore") else: return cat.map(str) + + +NULLABLE_NUMPY_STRING_TYPE = ( + np.dtype("O") + if Version(np.__version__) < Version("2") + else np.dtypes.StringDType(na_object=pd.NA) +) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 5afe4dc84..7aafa17bd 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -3,7 +3,6 @@ from functools import cached_property from typing import TYPE_CHECKING, Generic, TypeVar -import numpy as np import pandas as pd from anndata._core.index import _subset @@ -12,13 +11,20 @@ from anndata.compat import H5Array, ZarrArray from ..._settings import settings -from ...compat import XBackendArray, XDataArray, XZarrArrayWrapper +from ...compat import ( + NULLABLE_NUMPY_STRING_TYPE, + XBackendArray, + XDataArray, + XZarrArrayWrapper, +) from ...compat import xarray as xr if TYPE_CHECKING: from pathlib import Path from typing import Literal + import numpy as np + from anndata._core.index import Index from anndata.compat import ZarrGroup @@ -146,7 +152,8 @@ def __getitem__( extension_array = pd.arrays.BooleanArray(values, mask=mask) elif self._dtype_str == "nullable-string-array": # https://github.com/pydata/xarray/issues/10419 - values[mask] = np.nan + values = values.astype(self.dtype) + values[mask] = pd.NA return values else: msg = f"Invalid dtype_str {self._dtype_str}" @@ -164,7 +171,7 @@ def dtype(self): return pd.BooleanDtype() elif self._dtype_str == "nullable-string-array": # https://github.com/pydata/xarray/issues/10419 - return np.dtype("O") + return NULLABLE_NUMPY_STRING_TYPE msg = f"Invalid dtype_str {self._dtype_str}" raise RuntimeError(msg)