From b69917e9dae8ee0ba614c6ea01768d944572568c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 17 Jun 2025 10:36:30 +0200 Subject: [PATCH 1/5] (feat): use numpy nan-able string type if possible --- src/anndata/_io/specs/lazy_methods.py | 13 ++++++++++--- src/anndata/_io/specs/methods.py | 4 ++-- src/anndata/compat/__init__.py | 7 +++++++ src/anndata/experimental/backed/_lazy_arrays.py | 15 +++++++++++---- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 40a38018c..bd26d5ac6 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -14,7 +14,15 @@ from anndata._core.file_backing import filename, get_elem_name from anndata._core.xarray import Dataset2D, requires_xarray from anndata.abc import CSCDataset, CSRDataset -from anndata.compat import DaskArray, H5Array, H5Group, XDataArray, ZarrArray, ZarrGroup +from anndata.compat import ( + NULLABLE_NUMPY_STRING_TYPE, + DaskArray, + H5Array, + H5Group, + XDataArray, + ZarrArray, + ZarrGroup, +) from .registry import _LAZY_REGISTRY, IOSpec @@ -251,8 +259,7 @@ def _gen_xarray_dict_iterator_from_elems( "base_path_or_zarr_group": v.base_path_or_zarr_group, "elem_name": v.elem_name, "is_nullable_string": isinstance(v, MaskedArray) - and v.dtype # CategoricalArray dtype access requires a read nad is not necessary here - == np.dtype("O"), + and v.dtype == NULLABLE_NUMPY_STRING_TYPE, }, ) elif k == dim_name: diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index b934d37f3..f485c8800 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -41,7 +41,7 @@ ) from ..._settings import settings -from ...compat import is_zarr_v2 +from ...compat import NULLABLE_NUMPY_STRING_TYPE, is_zarr_v2 from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial if TYPE_CHECKING: @@ -1210,7 +1210,7 @@ def _string_array( values: np.ndarray, mask: np.ndarray ) -> pd.api.extensions.ExtensionArray: """Construct a string array from values and mask.""" - arr = pd.array(values, dtype=pd.StringDtype()) + arr = pd.array(values.astype(NULLABLE_NUMPY_STRING_TYPE), dtype=pd.StringDtype()) arr[mask] = pd.NA return arr diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index ba0835607..b8c63e4f2 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -404,3 +404,10 @@ def _map_cat_to_str(cat: pd.Categorical) -> pd.Categorical: return cat.map(str, na_action="ignore") else: return cat.map(str) + + +NULLABLE_NUMPY_STRING_TYPE = ( + np.dtype("O") + if Version(np.__version__) < Version("2") + else np.dtypes.StringDType(na_object=pd.NA) +) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 5afe4dc84..7aafa17bd 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -3,7 +3,6 @@ from functools import cached_property from typing import TYPE_CHECKING, Generic, TypeVar -import numpy as np import pandas as pd from anndata._core.index import _subset @@ -12,13 +11,20 @@ from anndata.compat import H5Array, ZarrArray from ..._settings import settings -from ...compat import XBackendArray, XDataArray, XZarrArrayWrapper +from ...compat import ( + NULLABLE_NUMPY_STRING_TYPE, + XBackendArray, + XDataArray, + XZarrArrayWrapper, +) from ...compat import xarray as xr if TYPE_CHECKING: from pathlib import Path from typing import Literal + import numpy as np + from anndata._core.index import Index from anndata.compat import ZarrGroup @@ -146,7 +152,8 @@ def __getitem__( extension_array = pd.arrays.BooleanArray(values, mask=mask) elif self._dtype_str == "nullable-string-array": # https://github.com/pydata/xarray/issues/10419 - values[mask] = np.nan + values = values.astype(self.dtype) + values[mask] = pd.NA return values else: msg = f"Invalid dtype_str {self._dtype_str}" @@ -164,7 +171,7 @@ def dtype(self): return pd.BooleanDtype() elif self._dtype_str == "nullable-string-array": # https://github.com/pydata/xarray/issues/10419 - return np.dtype("O") + return NULLABLE_NUMPY_STRING_TYPE msg = f"Invalid dtype_str {self._dtype_str}" raise RuntimeError(msg) From 0cbfc3e34f7d8dd4b9064ef5b67cd7a52c2c649a Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Tue, 17 Jun 2025 15:28:41 +0200 Subject: [PATCH 2/5] (chore): relnote --- docs/release-notes/2011.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/release-notes/2011.feature.md diff --git a/docs/release-notes/2011.feature.md b/docs/release-notes/2011.feature.md new file mode 100644 index 000000000..19256ef4a --- /dev/null +++ b/docs/release-notes/2011.feature.md @@ -0,0 +1 @@ +Use {attr}`numpy.dtypes.StringDType` with `na_object` set to {attr}`pandas.NA` for nullable string data as the backing data to the {class}`pandas.arrays.StringArray` {user}`ilan-gold` From 2136841f29a647946d8161f6919d3243b866857b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 17 Jun 2025 16:20:32 +0200 Subject: [PATCH 3/5] (fix): no need to cast in `_string_array` --- src/anndata/_io/specs/methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index f485c8800..b934d37f3 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -41,7 +41,7 @@ ) from ..._settings import settings -from ...compat import NULLABLE_NUMPY_STRING_TYPE, is_zarr_v2 +from ...compat import is_zarr_v2 from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial if TYPE_CHECKING: @@ -1210,7 +1210,7 @@ def _string_array( values: np.ndarray, mask: np.ndarray ) -> pd.api.extensions.ExtensionArray: """Construct a string array from values and mask.""" - arr = pd.array(values.astype(NULLABLE_NUMPY_STRING_TYPE), dtype=pd.StringDtype()) + arr = pd.array(values, dtype=pd.StringDtype()) arr[mask] = pd.NA return arr From 2efed2df497447166ed996268d07eb64238c15a0 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Wed, 18 Jun 2025 12:19:29 +0200 Subject: [PATCH 4/5] (chore): update relnote --- docs/release-notes/2011.feature.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-notes/2011.feature.md b/docs/release-notes/2011.feature.md index 19256ef4a..e686484c0 100644 --- a/docs/release-notes/2011.feature.md +++ b/docs/release-notes/2011.feature.md @@ -1 +1 @@ -Use {attr}`numpy.dtypes.StringDType` with `na_object` set to {attr}`pandas.NA` for nullable string data as the backing data to the {class}`pandas.arrays.StringArray` {user}`ilan-gold` +Use {attr}`numpy.dtypes.StringDType` with `na_object` set to {attr}`pandas.NA` for nullable string data with {class}`anndata.experimental.Dataset2D` {user}`ilan-gold` From e368910546d7b6e7bea54f32eac39c797d0355a7 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Wed, 18 Jun 2025 12:25:18 +0200 Subject: [PATCH 5/5] (fix): relnote link --- docs/release-notes/2011.feature.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-notes/2011.feature.md b/docs/release-notes/2011.feature.md index e686484c0..c34c66469 100644 --- a/docs/release-notes/2011.feature.md +++ b/docs/release-notes/2011.feature.md @@ -1 +1 @@ -Use {attr}`numpy.dtypes.StringDType` with `na_object` set to {attr}`pandas.NA` for nullable string data with {class}`anndata.experimental.Dataset2D` {user}`ilan-gold` +Use {attr}`numpy.dtypes.StringDType` with `na_object` set to {attr}`pandas.NA` for nullable string data with {class}`anndata.experimental.backed.Dataset2D` {user}`ilan-gold`