diff --git a/cf/data/QUESTIONS.rst b/cf/data/QUESTIONS.rst new file mode 100644 index 0000000000..9014430d4e --- /dev/null +++ b/cf/data/QUESTIONS.rst @@ -0,0 +1,25 @@ +Questions and answers +===================== + +A place to record random thoughts about the daskification of +`cf.Data`, possibly prior to starting an issue on GitHub. + +---- + +Q. When we run something that executes all of the lazy operations + (like `cf.Data.is_masked`), should/could we replace the dask array + with a "persisted" version of the computed data? If we did this, we + would want to have the ability to cache persisted chunks to disk, + as they came into being on each thread (see, for instance, + `chest`). To do this or not do this could be controlled by a + configuation setting. + +A. ? + +---- + +Q. + +A. ? + +---- diff --git a/cf/data/README.rst b/cf/data/README.rst new file mode 100644 index 0000000000..6e404b720e --- /dev/null +++ b/cf/data/README.rst @@ -0,0 +1,24 @@ +`cf.Data` developer notes +========================= + +Hardness of the mask +-------------------- + +Any `cf.Data` method that changes the dask array should consider +whether or not the mask hardness needs resetting before +returning. This will be necessary if there is the possibility that the +operation being applied to the dask array could lose the "memory" on +its chunks of whether or not the mask is hard. + +A common situation that causes a chunk to lose its memory of whether +or not the mask is hard is when a chunk could have contained a +unmasked `numpy` array prior to the operation, but the operation could +convert it to a masked `numpy` array. The new masked array will always +have the `numpy` default hardness (i.e. soft), which may be +incorrect. + +The mask hardness is most easily reset with the +`cf.Data._reset_mask_hardness` method. + +`cf.Data.__setitem__` and `cf.Data.where` are examples of methods that +need to reset the mask in this manner. diff --git a/cf/data/data.py b/cf/data/data.py index edea334fe9..6839572a6a 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1,9 +1,11 @@ import logging +import math import operator from functools import partial, reduce, wraps from itertools import product from json import dumps as json_dumps from json import loads as json_loads +from numbers import Integral from operator import mul try: @@ -104,6 +106,7 @@ from .partition import Partition from .partitionmatrix import PartitionMatrix from .utils import ( # is_small,; is_very_small, + conform_units, convert_to_datetime, convert_to_reftime, dask_compatible, @@ -111,6 +114,11 @@ new_axis_identifier, ) +# from .chunk_utils import ( # is_small,; is_very_small, +# harden_mask_chunk, +# soften_mask_chunk, +# ) + # from dask.array import Array @@ -298,8 +306,7 @@ def __init__( array: optional The array of values. May be any scalar or array-like - object, including another `Data` instance. Ignored if the - *source* parameter is set. + object, including another `Data` instance. *Parameter example:* ``array=[34.6]`` @@ -312,8 +319,7 @@ def __init__( units: `str` or `Units`, optional The physical units of the data. if a `Units` object is - provided then this an also set the calendar. Ignored if - the *source* parameter is set. + provided then this an also set the calendar. The units (without the calendar) may also be set after initialisation with the `set_units` method. @@ -325,8 +331,7 @@ def __init__( ``units='days since 2018-12-01'`` calendar: `str`, optional - The calendar for reference time units. Ignored if the - *source* parameter is set. + The calendar for reference time units. The calendar may also be set after initialisation with the `set_calendar` method. @@ -338,8 +343,7 @@ def __init__( The fill value of the data. By default, or if set to `None`, the `numpy` fill value appropriate to the array's data-type will be used (see - `numpy.ma.default_fill_value`). Ignored if the *source* - parameter is set. + `numpy.ma.default_fill_value`). The fill value may also be set after initialisation with the `set_fill_value` method. @@ -349,7 +353,8 @@ def __init__( dtype: data-type, optional The desired data-type for the data. By default the - data-type will be inferred form the *array* parameter. + data-type will be inferred form the *array* + parameter. The data-type may also be set after initialisation with the `dtype` attribute. @@ -367,11 +372,12 @@ def __init__( mask: optional Apply this mask to the data given by the *array* - parameter. By default, or if *mask* is `None`, no mask is - applied. May be any scalar or array-like object (such as a - `list`, `numpy` array or `Data` instance) that is - broadcastable to the shape of *array*. Masking will be - carried out where the mask elements evaluate to `True`. + parameter. By default, or if *mask* is `None`, no mask + is applied. May be any scalar or array-like object + (such as a `list`, `numpy` array or `Data` instance) + that is broadcastable to the shape of *array*. Masking + will be carried out where the mask elements evaluate + to `True`. This mask will applied in addition to any mask already defined by the *array* parameter. @@ -379,8 +385,10 @@ def __init__( .. versionadded:: 3.0.5 source: optional - Initialize the array, units, calendar and fill value from - those of *source*. + Initialize the data values and metadata (such as + units, mask hardness, etc.) from the data of + *source*. All other arguments, with the exception of + *copy*, are ignored. hardmask: `bool`, optional If False then the mask is soft. By default the mask is @@ -458,7 +466,6 @@ def __init__( "Can't set the 'source' and 'loads' parameters " "at the same time" ) - # --- End: if if source is not None: super().__init__(source=source, _use_array=_use_array) @@ -472,6 +479,11 @@ def __init__( else: self._del_dask(None) + # Note the mask hardness. It is safe to assume that if a + # dask array has been set, then it's mask hardness will be + # already baked into each chunk. + self._hardmask = getattr(source, "hardmask", _DEFAULT_HARDMASK) + return super().__init__( @@ -481,7 +493,8 @@ def __init__( # Create the _HDF_chunks attribute: defines HDF chunking when # writing to disk. # - # Never change the _HDF_chunks attribute in-place. + # Never change the value of the _HDF_chunks attribute + # in-place. self._HDF_chunks = None if loadd is not None: @@ -496,7 +509,12 @@ def __init__( units = Units(units, calendar=calendar) self._Units = units - # Set the mask hardness + # Note the mask hardness. This only records what we want the + # mask hardness to be, and is required in case this + # initialization does not set an array (i.e. array is None or + # _use_array is False). If a dask array is actually set later + # on, then the mask hardness will be set properly, i.e. it + # will be baked into each chunk. self._hardmask = hardmask if array is None: @@ -513,7 +531,7 @@ def __init__( # is removed from _axes then it must also be removed from # _cyclic. # - # Never change the _cyclic attribute in-place. + # Never change the value of the _cyclic attribute in-place. self._cyclic = _empty_set # Create the _axes attribute: an ordered sequence of unique @@ -558,7 +576,6 @@ def __init__( first_value = first_non_missing_value(array) if first_value is not None: dt = hasattr(first_value, "timetuple") - # --- End: if # Convert string or object date-times to floating point # reference times @@ -568,7 +585,10 @@ def __init__( self._Units = units # Store the dask array - self._set_dask(array, delete_source=False) + self._set_dask(array, delete_source=False, reset_mask_hardness=False) + + # Set the mask hardness on each chunk. + self.hardmask = hardmask # Override the data type if dtype is not None: @@ -577,10 +597,6 @@ def __init__( # Apply a mask if mask is not None: self.where(mask, cf_masked, inplace=True) - else: - # Set the mask hardness (which is otherwise set by the - # 'where' method) - self._set_mask_hardness() @property def dask_array(self): @@ -620,17 +636,17 @@ def __contains__(self, value): **Performance** - All delayed operations are computed, and the operation does - not short-circuit once the first occurrence is found. + All delayed operations are exectued, and there is no + short-circuit once the first occurrence is found. **Examples:** - >>> d = Data([[0.0, 1, 2], [3, 4, 5]], 'm') + >>> d = cf.Data([[0.0, 1, 2], [3, 4, 5]], 'm') >>> 4 in d True - >>> Data(3) in d + >>> cf.Data(3) in d True - >>> Data([2.5], units='2 m') in d + >>> cf.Data([2.5], units='2 m') in d True >>> [[2]] in d True @@ -659,7 +675,7 @@ def contains_chunk(a, value): dx = self._get_dask() - out_ind = tuple(dx.ndim) + out_ind = tuple(range(dx.ndim)) dx_ind = out_ind dx = da.blockwise( @@ -918,7 +934,13 @@ def __getitem__(self, indices): (similar to the way vector subscripts work in Fortran). This is the same behaviour as indexing on a `netCDF4.Variable` object. - . seealso:: `__setitem__`, `_parse_indices` + **Performance** + + If the shape of the data is unknown then it is calculated + immediately by exectuting all delayed operations. + + . seealso:: `__setitem__`, `__keepdims_indexing__`, + `__orthogonal_indexing__` :Returns: @@ -946,8 +968,6 @@ def __getitem__(self, indices): if indices is Ellipsis: return self.copy() - d = self - auxiliary_mask = () try: arg = indices[0] @@ -958,53 +978,126 @@ def __getitem__(self, indices): auxiliary_mask = indices[1] indices = indices[2:] + shape = self.shape + keepdims = self.__keepdims_indexing__ + indices, roll = parse_indices( - d.shape, indices, cyclic=True, numpy_indexing=False + shape, + indices, + cyclic=True, + keepdims=keepdims, ) - # TODODASK - sort out the "numpy" environment - - # TODODASK - multiple list indices - - axes = d._axes - cyclic_axes = d._cyclic + axes = self._axes + cyclic_axes = self._cyclic + # ------------------------------------------------------------ + # Roll axes with cyclic slices + # ------------------------------------------------------------ if roll: - # Roll axes with cyclic slices. For example, if slice(-2, - # 3) has been requested on a cyclic axis (and we're not - # using numpy indexing), then we roll that axis by two - # points and apply the slice(0, 5) instead. - if cyclic_axes.intersection([axes[i] for i in roll]): + # For example, if slice(-2, 3) has been requested on a + # cyclic axis, then we roll that axis by two points and + # apply the slice(0, 5) instead. + if not cyclic_axes.issuperset([axes[i] for i in roll]): raise IndexError( "Can't take a cyclic slice of a non-cyclic axis" ) - d = d.roll(axis=tuple(roll.keys()), shift=tuple(roll.values())) - new = d + new = self.roll( + axis=tuple(roll.keys()), shift=tuple(roll.values()) + ) + dx = new._get_dask() else: - new = d.copy(array=False) + new = self.copy(array=False) + dx = self._get_dask() - # Get the subspaced dask array - dx = d._get_dask() - dx = dx[tuple(indices)] - new._set_dask(dx) + # ------------------------------------------------------------ + # Subspace the dask array + # ------------------------------------------------------------ + if self.__orthogonal_indexing__: + # Apply 'orthogonal indexing': indices that are 1-d arrays + # or lists subspace along each dimension + # independently. This behaviour is similar to Fortran, but + # different to dask. + axes_with_list_indices = [ + i + for i, x in enumerate(indices) + if isinstance(x, list) or getattr(x, "shape", False) + ] + n_axes_with_list_indices = len(axes_with_list_indices) - # Apply any auxiliary mask - for mask in auxiliary_mask: - new.where(mask, cf_masked, inplace=True) + if n_axes_with_list_indices < 2: + # At most one axis has a list/1-d array index so do a + # normal dask subspace + dx = dx[tuple(indices)] + else: + # At least two axes have list/1-d array indices so we + # can't do a normal dask subspace + + # Subspace axes which have list/1-d array indices + for axis in axes_with_list_indices: + dx = da.take(dx, indices[axis], axis=axis) + + if n_axes_with_list_indices < len(indices): + # Subspace axes which don't have list/1-d array + # indices. (Do this after subspacing axes which do + # have list/1-d array indices, in case + # __keepdims_indexing__ is False.) + slice_indices = [ + slice(None) if i in axes_with_list_indices else x + for i, x in enumerate(indices) + ] + dx = dx[tuple(slice_indices)] + else: + raise NotImplementedError( + "Non-orthogonal indexing has not yet been implemented" + ) + + # ------------------------------------------------------------ + # Set the subspaced dask array + # ------------------------------------------------------------ + new._set_dask(dx, reset_mask_hardness=False) - # Cyclic axes which have been reduced in size are no longer - # cyclic + # ------------------------------------------------------------ + # Get the axis identifiers for the subspace + # ------------------------------------------------------------ + shape0 = shape + if keepdims: + new_axes = axes + else: + new_axes = [ + axis + for axis, x in zip(axes, indices) + if not isinstance(x, Integral) and getattr(x, "shape", True) + ] + if new_axes != axes: + new._axes = new_axes + cyclic_axes = new._cyclic + if cyclic_axes: + shape0 = [ + n for n, axis in zip(shape, axes) if axis in new_axes + ] + + # ------------------------------------------------------------ + # Cyclic axes that have been reduced in size are no longer + # considered to be cyclic + # ------------------------------------------------------------ if cyclic_axes: x = [ axis - for axis, n0, n1 in zip(axes, d.shape, new.shape) - if n1 != n0 and axis in cyclic_axes + for axis, n0, n1 in zip(new_axes, shape0, new.shape) + if axis in cyclic_axes and n0 != n1 ] if x: - # Never change the _cyclic attribute in-place + # Never change the value of the _cyclic attribute + # in-place new._cyclic = cyclic_axes.difference(x) - # --- End: if + + # ------------------------------------------------------------ + # Apply auxiliary masks + # ------------------------------------------------------------ + for mask in auxiliary_mask: + new.where(mask, cf_masked, None, inplace=True) return new @@ -1020,6 +1113,12 @@ def __setitem__(self, indices, value): a subspace. See `__getitem__` for details on how to define subspace of the data array. + .. note:: Currently at most one dimension's assignment index + may be a 1-d array of integers or booleans. This is + is different to `__getitem__`, which applies + 'orthogonal indexing' when multiple indices of 1-d + array of integers or booleans are present. + **Missing data** The treatment of missing data elements during assignment to a @@ -1034,78 +1133,202 @@ def __setitem__(self, indices, value): the `cf.masked` constant or by assignment to a value which contains masked elements. - .. seealso:: `cf.masked`, `hardmask`, `where` + **Performance** + + If the shape of the data is unknown then it is calculated + immediately by executing all delayed operations. + + .. seealso:: `__getitem__`, `cf.masked`, `hardmask`, `where` **Examples:** """ - # TODODASK - sort out the "numpy" environment - indices, roll = parse_indices( - self.shape, indices, cyclic=True, numpy_indexing=False + self.shape, indices, cyclic=True, keepdims=True ) indices = tuple(indices) + axes_with_list_indices = [ + i + for i, x in enumerate(indices) + if isinstance(x, list) or getattr(x, "shape", False) + ] + if len(axes_with_list_indices) > 1: + raise NotImplementedError( + "Currently limited to at most one dimension's assignment " + "index being a 1-d array of integers or booleans. " + f"Got: {indices}" + ) + # TODODASK: The inherited algorithm that does assignment + # for multiple list/1-d array indices + # (cfdm.Data._set_subspace) won't work when the + # 1-d array is a dask array because it may need + # to be computed at __setitem__ runtime, which + # is not desirable. Until this can be fixed, + # it's easiest to disallow this case, that was + # allowed pre-dask. + + # Roll axes with cyclic slices if roll: - # Roll axes with cyclic slices. For example, if assigning - # to slice(-2, 3) has been requested on a cyclic axis (and - # we're not using numpy indexing), then we roll that axis - # by two points and assign to slice(0, 5) instead. The - # axis is then unrolled by two points afer the assignment - # has been made. + # For example, if assigning to slice(-2, 3) has been + # requested on a cyclic axis (and we're not using numpy + # indexing), then we roll that axis by two points and + # assign to slice(0, 5) instead. The axis is then unrolled + # by two points afer the assignment has been made. axes = self._axes - if self._cyclic.intersection([axes[i] for i in roll]): + if not self._cyclic.issuperset([axes[i] for i in roll]): raise IndexError( "Can't do a cyclic assignment to a non-cyclic axis" ) roll_axes = tuple(roll.keys()) shifts = tuple(roll.values()) - self.roll(axis=roll_axes, shift=shifts, inplace=True) - - # TODODASK: multiple lists + self.roll(shift=shifts, axis=roll_axes, inplace=True) # Make sure that the units of value are the same as self - try: - value_units = value.Units - except AttributeError: - pass - else: - self_units = self.Units - if value_units.equivalent(self_units): - if value_units != self_units: - value = value.copy() - value.Units = self.Units - elif value_units and self_units: - raise ValueError( - f"Can't assign values with units {value_units!r} " - f"to data with units {self_units!r}" - ) - # --- End: try - - # Set the mask hardness, in case any previous operations have - # inadvertently changed it. - self._set_mask_hardness() + value = conform_units(value, self.Units) # Do the assignment dx = self._get_dask() dx[indices] = dask_compatible(value) + # Unroll any axes that were rolled to enable a cyclic + # assignment if roll: - # Unroll any axes that were rolled to enable a cyclic - # assignment shifts = [-shift for shift in shifts] - self.roll(axis=roll_axes, shift=shifts, inplace=True) + self.roll(shift=shifts, axis=roll_axes, inplace=True) + + # Reset the mask hardness, otherwise it could be incorrect in + # the case that a chunk that was not a masked array is + # assigned missing values. + self._reset_mask_hardness() return + # ---------------------------------------------------------------- + # Indexing behaviour attributes + # ---------------------------------------------------------------- + @property + @daskified(1) + def __orthogonal_indexing__(self): + """Flag to indicate that orthogonal indexing is supported. + + Always True, indicating that 'orthogonal indexing' is + applied. This means that when indices are 1-d arrays or lists + then they subspace along each dimension independently. This + behaviour is similar to Fortran, but different to `numpy`. + + .. versionadded:: TODODASK + + .. seealso:: `__keepdims_indexing__`, `__getitem__`, + `__setitem__`, + `netCDF4.Variable.__orthogonal_indexing__` + + **Examples** + + >>> d = cf.Data([[1, 2, 3], + ... [4, 5, 6]]) + >>> e = d[[0], [0, 2]] + >>> e.shape + (1, 2) + >>> print(e.array) + [[1 3]] + >>> e = d[[0, 1], [0, 2]] + >>> e.shape + (2, 2) + >>> print(e.array) + [[1 3] + [4 6]] + + """ + return True + + @property + @daskified(1) + def __keepdims_indexing__(self): + """Flag to indicate whether dimensions indexed with integers are kept. + + If set to True (the default) then providing a single integer + as a single-axis index does *not* reduce the number of array + dimensions by 1. This behaviour is different to `numpy`. + + If set to False then providing a single integer as a + single-axis index reduces the number of array dimensions by + 1. This behaviour is the same as `numpy`. + + .. versionadded:: TODODASK + + .. seealso:: `__orthogonal_indexing__`, `__getitem__`, + `__setitem__` + + **Examples** + + >>> d = cf.Data([[1, 2, 3], + ... [4, 5, 6]]) + >>> d.__keepdims_indexing__ + True + >>> e = d[0] + >>> e.shape + (1, 3) + >>> print(e.array) + [[1 2 3]] + + >>> d.__keepdims_indexing__ + True + >>> e = d[:, 1] + >>> e.shape + (2, 1) + >>> print(e.array) + [[2] + [5]] + + >>> d.__keepdims_indexing__ + True + >>> e = d[0, 1] + >>> e.shape + (1, 1) + >>> print(e.array) + [[2]] + + >>> d.__keepdims_indexing__ = False + >>> e = d[0] + >>> e.shape + (3,) + >>> print(e.array) + [1 2 3] + + >>> d.__keepdims_indexing__ + False + >>> e = d[:, 1] + >>> e.shape + (2,) + >>> print(e.array) + [2 5] + + >>> d.__keepdims_indexing__ + False + >>> e = d[0, 1] + >>> e.shape + () + >>> print(e.array) + 2 + + """ + return self._custom.get("__keepdims_indexing__", True) + + @__keepdims_indexing__.setter + def __keepdims_indexing__(self, value): + self._custom["__keepdims_indexing__"] = bool(value) + # ---------------------------------------------------------------- # Private dask methods # ---------------------------------------------------------------- def _get_dask(self): """Get the dask array. - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK + + .. seealso:: `_set_dask`, `_del_dask` :Returns: @@ -1115,10 +1338,14 @@ def _get_dask(self): """ return self._custom["dask"] - def _set_dask(self, array, copy=False, delete_source=True): + def _set_dask( + self, array, copy=False, delete_source=True, reset_mask_hardness=True + ): """Set the dask array. - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK + + .. seealso:: `_get_dask`, `_del_dask`, `_reset_mask_hardness` :Parameters: @@ -1130,7 +1357,14 @@ def _set_dask(self, array, copy=False, delete_source=True): default the dask array is not copied. delete_source: `bool`, optional - TODODASK + If False then do not delete a compressed source array, + if one exists, after setting the new dask array. By + default a compressed source array is deleted. + + reset_mask_hardness: `bool`, optional + If False then do not reset the mask hardness after + setting the new dask array. By default the mask + hardness is re-applied. :Returns: @@ -1147,10 +1381,15 @@ def _set_dask(self, array, copy=False, delete_source=True): # guarantee its consistency with the new dask array. self._del_Array(None) + if reset_mask_hardness: + self._reset_mask_hardness() + def _del_dask(self, default=ValueError(), delete_source=True): """Remove the dask array. - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK + + .. seealso:: `_set_dask`, `_get_dask` :Parameters: @@ -1199,21 +1438,27 @@ def _del_dask(self, default=ValueError(), delete_source=True): return out - def _dask_map_blocks(self, func, **kwargs): - """Apply a function in-place to the dask array using - `map_blocks`. + def _map_blocks(self, func, **kwargs): + """Apply a function to the data in-place. - .. versionadded:: 4.0.0 + .. note:: This method does not reset the mask hardness. It may + be necessary for a call to `_map_blocks` to be + followed by a call to `_reset_mask_hardness`. + + .. versionadded:: TODODASK + + .. seealso:: `_reset_mask_hardness` :Parameters: func: - The funciton to be applied to each chunk of the dask - array. + The funciton to be applied to the data, via + `dask.array.Array.map_blocks`, to each chunk of the + dask array. kwargs: optional - Keyword arguments passed to the map `map_blocks` - method of the dask array. + Keyword arguments passed to the + `dask.array.Array.map_blocks` method. :Returns: @@ -1223,17 +1468,31 @@ def _dask_map_blocks(self, func, **kwargs): **Examples:** >>> d = cf.Data([1, 2, 3]) - >>> dx = d._dask_map_blocks(lambda x: x / 2, dtype=float) + >>> dx = d._map_blocks(lambda x: x / 2) >>> print(d.array) [0.5 1. 1.5] """ dx = self._get_dask() dx = dx.map_blocks(func, **kwargs) - self._set_dask(dx) + self._set_dask(dx, reset_mask_hardness=False) return dx + def _reset_mask_hardness(self): + """Re-apply the mask hardness to the dask array. + + .. versionadded:: TODODASK + + .. seealso:: `hardmask`, `harden_mask`, `soften_mask` + + :Returns: + + `None` + + """ + self.hardmask = self.hardmask + @_inplace_enabled(default=False) def diff(self, axis=-1, n=1, inplace=False): """Calculate the n-th discrete difference along the given axis. @@ -2012,7 +2271,7 @@ def persist(self, inplace=False): dx = self._get_dask() dx = dx.persist() - d._set_dask(dx) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -2295,14 +2554,14 @@ def loadd(self, d, chunk=True): self._size = reduce(mul, shape, 1) cyclic = d.get("_cyclic", None) - # Never change the _cyclic attribute in-place + # Never change the value of the _cyclic attribute in-place if cyclic: self._cyclic = cyclic.copy() else: self._cyclic = _empty_set HDF_chunks = d.get("_HDF_chunks", None) - # Never change the _HDF_chunks attribute in-place + # Never change the value of the _HDF_chunks attribute in-place if HDF_chunks: self._HDF_chunks = HDF_chunks.copy() else: @@ -2951,7 +3210,7 @@ def rechunk( dx = d._get_dask() dx = dx.rechunk(chunks, threshold, block_size_limit, balance) - d._set_dask(dx, delete_source=False) + d._set_dask(dx, delete_source=False, reset_mask_hardness=False) return d @@ -4217,7 +4476,7 @@ def _unary_operation(self, operation): dx = self._get_dask() dx = getattr(operator, operation)(dx) - out._set_dask(dx) + out._set_dask(dx, reset_mask_hardness=False) return out @@ -5414,13 +5673,12 @@ def _Units(self): def _cyclic(self): """Storage for axis cyclicity. - Identifies which axes are cyclic (and therefore allow cyclic - slicing). - - It must be a subset of the axis identifiers given by the - `_axes` attribute. + Contains a `set` that identifies which axes are cyclic (and + therefore allow cyclic slicing). The set contains a subset of + the axis identifiers defined by the `_axes` attribute. - .. warning:: Never change the `_cyclic` attribute in-place. + .. warning:: Never change the value of the `_cyclic` attribute + in-place. .. note:: When an axis identifier is removed from the `_axes` attribute then it is automatically also removed from @@ -5455,56 +5713,47 @@ def _HDF_chunks(self): del self._custom["_HDF_chunks"] @property + @daskified(1) def _hardmask(self): - """TODODASK.""" + """Storage for the mask hardness. + + Contains a `bool`, where `True` denotes a hard mask and + `False` denotes a soft mask. + + See `hardmask` for details. + + """ return self._custom["_hardmask"] @_hardmask.setter def _hardmask(self, value): self._custom["_hardmask"] = value - @_hardmask.deleter - def _hardmask(self): - del self._custom["_hardmask"] - @property @daskified(1) def _axes(self): """Storage for the axis identifiers. - Contains an ordered sequence of unique (within this `Data` - instance) identifiers for each array axis. + Contains a `tuple` of identifiers, one for each array axis. - .. note:: When an axis identifier is removed from the `_axes` - attribute then it is automatically also removed from - the `_cyclic` attribute. + .. note:: When the axis identifiers are reset, then any axis + identifier named by the `_cyclic` attribute which is + not in the new `_axes` set is automatically removed + from the `_cyclic` attribute. """ return self._custom["_axes"] @_axes.setter def _axes(self, value): - value = tuple(value) - self._custom["_axes"] = value + self._custom["_axes"] = tuple(value) - # Update cyclic: Remove cyclic axes that are not in the new - # axes + # Remove cyclic axes that are not in the new axes cyclic = self._cyclic if cyclic: + # Never change the value of the _cyclic attribute in-place self._cyclic = cyclic.intersection(value) - @property - def numpy_indexing(self): - """TODODASK - probably thewrong name - need a gneral numpy - compatability flag. See also confg settings - - """ - return self._custom.get("numpy_indexing", False) - - @numpy_indexing.setter - def numpy_indexing(self, value): - self._custom["numpy_indexing"] = bool(value) - # ---------------------------------------------------------------- # Dask attributes # ---------------------------------------------------------------- @@ -5572,9 +5821,9 @@ def Units(self, value): dtype = _dtype_float32 else: dtype = _dtype_float - # --- End: if - self._dask_map_blocks( + # Apply the units conversion to the data + self._map_blocks( partial( Units.conform, from_units=old_units, @@ -5646,10 +5895,11 @@ def dtype(self): def dtype(self, value): dx = self._get_dask() - # Only change the datatype if it's different + # Only change the datatype if it's different to that of the + # dask array if dx.dtype != value: dx = dx.astype(value) - self._set_dask(dx) + self._set_dask(dx, reset_mask_hardness=False) @property def fill_value(self): @@ -5687,16 +5937,22 @@ def hardmask(self): """Hardness of the mask. If the `hardmask` attribute is `True`, i.e. there is a hard - mask, then unmasking an entry will silently not occur. This - feature prevents overwriting the mask. To allow the unmasking - of an entries when the data has a hard mask, the mask must - first to be softened using the `soften_mask` method, that - changes the `hardmask` attribute to `False`. The mask can be - re-hardened with `harden_mask` method. + mask, then unmasking an entry will silently not occur. This is + the default, and prevents overwriting the mask. - The following operations + If the `hardmask` attribute is `False`, i.e. there is a soft + mask, then masked entries may be overwritten with non-missing + values. - .. seealso:: `harden_mask`, `soften_mask`, `where` + To allow the unmasking of masked values, the mask must be + softened by setting the `hardmask` attribute to False, or + equivalently with the `soften_mask` method. + + The mask can be hardened by setting the `hardmask` attribute + to True, or equivalently with the `harden_mask` method. + + .. seealso:: `harden_mask`, `soften_mask`, `where`, + `__setitem__` **Examples:** @@ -5706,36 +5962,25 @@ def hardmask(self): >>> d[0] = cf.masked >>> print(d.array) [-- 2 3] - >>> d[...]= 9 + >>> d[...]= 999 >>> print(d.array) - [-- 9 9] - >>> d.soften_mask() + [-- 999 999] + >>> d.hardmask = False >>> d.hardmask False >>> d[...] = -1 - False >>> print(d.array) [-1 -1 -1] - >>> d.harden_mask() - >>> d.hardmask - True - >>> d[0] = cf.masked - >>> d = d.where(True, 9) - >>> print(d.array) - [-- 9 9] - >>> d.soften_mask() - >>> d.hardmask - False - >>> d = d.where(True, 9) - >>> print(d.array) - [9 9 9] """ - return self._custom["_hardmask"] + return self._hardmask @hardmask.setter def hardmask(self, value): - raise AttributeError("TODODASK - use harden_mask/soften_mask instead") + if value: + self.harden_mask() + else: + self.soften_mask() @property @daskified(1) @@ -5744,7 +5989,7 @@ def is_masked(self): **Performance** - `is_masked` causes all delayed operations to be computed. + `is_masked` causes all delayed operations to be executed. **Examples:** @@ -5763,7 +6008,7 @@ def is_masked(a): dx = self._get_dask() - out_ind = tuple(dx.ndim) + out_ind = tuple(range(dx.ndim)) dx_ind = out_ind dx = da.blockwise( @@ -5803,6 +6048,11 @@ def nbytes(self): Does not include bytes consumed by the array mask + **Performance** + + If the number of bytes is unknown then it is calculated + immediately by executing all delayed operations. + **Examples:** >>> d = cf.Data([[1, 1.5, 2]]) @@ -5820,9 +6070,13 @@ def nbytes(self): """ dx = self._get_dask() - return dx.nbytes + if math.isnan(dx.size): + logger.warning( + "Computing data nbytes: Performance may be degraded" + ) + dx.compute_chunk_sizes() - # TODODASK - what about nans (e.g. after da.unique) + return dx.nbytes @property @daskified(1) @@ -5860,6 +6114,11 @@ def ndim(self): def shape(self): """Tuple of the data array's dimension sizes. + **Performance** + + If the shape of the data is unknown then it is calculated + immediately by executing all delayed operations. + **Examples:** >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) @@ -5880,16 +6139,22 @@ def shape(self): """ dx = self._get_dask() - # if nan: do some compute - return dx.shape + if math.isnan(dx.size): + logger.warning("Computing data shape: Performance may be degraded") + dx.compute_chunk_sizes() - # TODODASK - what about nans (e.g. after da.unique dx.shape -> (nan,)) + return dx.shape @property @daskified(1) def size(self): """Number of elements in the data array. + **Performance** + + If the size of the data is unknown then it is calculated + immediately by executing all delayed operations. + **Examples:** >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) @@ -5914,9 +6179,13 @@ def size(self): """ dx = self._get_dask() - return dx.size + size = dx.size + if math.isnan(size): + logger.warning("Computing data size: Performance may be degraded") + dx.compute_chunk_sizes() + size = dx.size - # TODODASK - what about nans (e.g. after da.unique) + return size @property @daskified(1) @@ -5958,7 +6227,6 @@ def array(self): a.harden_mask() else: a.soften_mask() - # --- End: if return a @@ -6031,7 +6299,6 @@ def datetime_array(self): a.harden_mask() else: a.soften_mask() - # --- End: if return a @@ -8382,7 +8649,7 @@ def cyclic(self, axes=None, iscyclic=True): axes = [data_axes[i] for i in self._parse_axes(axes)] - # Never change the _cyclic attribute in-place + # Never change the value of the _cyclic attribute in-place if iscyclic: self._cyclic = cyclic_axes.union(axes) else: @@ -8921,7 +9188,7 @@ def insert_dimension(self, position=0, inplace=False): dx = d._get_dask() dx = dx.reshape(shape) - d._set_dask(dx) + d._set_dask(dx, reset_mask_hardness=False) # Expand _axes axis = new_axis_identifier(d._axes) @@ -9355,15 +9622,14 @@ def harden_mask(self): """Force the mask to hard. Whether the mask of a masked array is hard or soft is - determined by its the `hardmask` property. `harden_mask` sets - `hardmask` to True. + determined by its `hardmask` property. `harden_mask` sets + `hardmask` to `True`. + .. versionadded:: TODODASK - **Examples:** + .. seealso:: `hardmask`, `soften_mask` - >>> d = cf.Data([1, 2, 3]) - >>> d.hardmask - True + **Examples:** >>> d = cf.Data([1, 2, 3], hardmask=False) >>> d.hardmask @@ -9372,24 +9638,27 @@ def harden_mask(self): >>> d.hardmask True - """ - - def harden_mask(a): - if np.ma.isMA(a): - a.harden_mask() + >>> d = cf.Data([1, 2, 3], mask=[False, True, False]) + >>> d.hardmask + True + >>> d[1] = 999 + >>> print(d.array) + [1 -- 3] - return a + """ + self._map_blocks(_cf_harden_mask, dtype=self.dtype) + self._hardmask = True - if self._hardmask: - # Mask is already hard - return + def soften_mask(self): + """Force the mask to soft. - self._dask_map_blocks(harden_mask, dtype=self.dtype) + Whether the mask of a masked array is hard or soft is + determined by its `hardmask` property. `soften_mask` sets + `hardmask` to `False`. - self._hardmask = True + .. versionadded:: TODODASK - def soften_mask(self): - """TODODASK. + .. seealso:: `hardmask`, `harden_mask` **Examples:** @@ -9400,32 +9669,17 @@ def soften_mask(self): >>> d.hardmask False - """ - - def soften_mask(a): - if np.ma.isMA(a): - a.soften_mask() - - return a - - if not self._hardmask: - # Mask is already soft - return - - self._dask_map_blocks(soften_mask, dtype=self.dtype) + >>> d = cf.Data([1, 2, 3], mask=[False, True, False], hardmask=False) + >>> d.hardmask + False + >>> d[1] = 999 + >>> print(d.array) + [ 1 999 3] + """ + self._map_blocks(_cf_soften_mask, dtype=self.dtype) self._hardmask = False - def _set_mask_hardness(self, hardmask=None): - """TODODASK.""" - if hardmask is None: - hardmask = self.hardmask - - if hardmask: - self.harden_mask() - else: - self.soften_mask() - @_inplace_enabled(default=False) def filled(self, fill_value=None, inplace=False): """Replace masked elements with the fill value. @@ -10110,7 +10364,8 @@ def datum(self, *index): """Return an element of the data array as a standard Python scalar. - TODODASK consider renameing/aliasing to 'item' + TODODASK: consider renameing/aliasing to 'item'. Might depend + on whether or not the APIs are the same. The first and last elements are always returned with ``d.datum(0)`` and ``d.datum(-1)`` respectively, even if the data @@ -10566,7 +10821,7 @@ def flip(self, axes=None, inplace=False, i=False): dx = d._get_dask() dx = dx[tuple(index)] - d._set_dask(dx) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -10589,8 +10844,8 @@ def HDF_chunks(self, *chunks): chunks = chunks[0] if chunks is None: - # Clear all chunking. Never change the _HDF_chunks - # attribute in-place. + # Clear all chunking. Never change the value of the + # _HDF_chunks attribute in-place. self._HDF_chunks = None return org_HDF_chunks @@ -10601,7 +10856,7 @@ def HDF_chunks(self, *chunks): if _HDF_chunks.values() == [None] * self.ndim: _HDF_chunks = None - # Never change the _HDF_chunks attribute in-place + # Never change the value of the _HDF_chunks attribute in-place self._HDF_chunks = _HDF_chunks return org_HDF_chunks @@ -11978,14 +12233,14 @@ def squeeze(self, axes=None, inplace=False, i=False): # one size 1 axis needs squeezing. dx = d._get_dask() dx = dx.squeeze(axis=tuple(axes)) - d._set_dask(dx) + d._set_dask(dx, reset_mask_hardness=False) # Remove the squeezed axes names d._axes = [axis for i, axis in enumerate(d._axes) if i not in axes] hdf = self._HDF_chunks if hdf: - # Never change the _HDF_chunks attribute in-place + # Never change the value of the _HDF_chunks attribute in-place self._HDF_chunks = { axis: size for axis, size in hdf.items() if axis not in axes } @@ -12507,6 +12762,14 @@ def roll(self, axis, shift, inplace=False, i=False): *Parameter example:* Convolve the last axis: ``axis=-1``. + shift: `int`, or `tuple` of `int` + The number of places by which elements are shifted. + If a `tuple`, then *axis* must be a tuple of the same + size, and each of the given axes is shifted by the + corresponding number. If an `int` while *axis* is a + tuple of `int`, then the same value is used for all + given axes. + {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} @@ -12516,22 +12779,13 @@ def roll(self, axis, shift, inplace=False, i=False): `Data` or `None` """ - d = _inplace_enabled_define_and_cleanup(self) - - if isinstance(shift, int): - shift = (shift,) + # TODODASK - consider matching the numpy/dask api: "shift, axis=" - axes = self._parse_axes(axis) - if len(axes) != len(shift): - raise ValueError("TODODASK") - - if not shift: - # Null roll - return d + d = _inplace_enabled_define_and_cleanup(self) dx = d._get_dask() - dx = da.roll(dx, axis=axes, shift=shift) - d._set_dask(dx) + dx = da.roll(dx, shift, axis=axis) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -13325,3 +13579,54 @@ def _broadcast(a, shape): tile = shape[0 : len(shape) - len(a_shape)] + tuple(tile[::-1]) return np.tile(a, tile) + + +"""Dask utilities to be called on chunks""" + + +def _cf_harden_mask(a): + """Harden the mask of a masked `numpy` array. + + Has no effect if the array is not a masked array. + + :Parameters: + + a: `numpy.ndarray` + The array to have a hardened mask. + + :Returns: + + `numpy.ndarray` + The array with hardened mask. + + :Returns: + + `numpy.ndarray` + + """ + if np.ma.isMA(a): + a.harden_mask() + + return a + + +def _cf_soften_mask(a): + """Soften the mask of a maked `numpy` array. + + Has no effect if the array is not a masked array. + + :Parameters: + + a: `numpy.ndarray` + The array to have a softened mask. + + :Returns: + + `numpy.ndarray` + The array with softened mask. + + """ + if np.ma.isMA(a): + a.soften_mask() + + return a diff --git a/cf/data/utils.py b/cf/data/utils.py index d7a150eba1..fb077ebac9 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -1,14 +1,12 @@ """General functions useful for `Data` functionality.""" -from functools import partial +from functools import lru_cache, partial from itertools import product -import numpy as np - import dask.array as da +import numpy as np -from ..cfdatetime import dt2rt, st2rt, rt2dt from ..cfdatetime import dt as cf_dt - +from ..cfdatetime import dt2rt, rt2dt, st2rt from ..units import Units @@ -93,7 +91,6 @@ def convert_to_reftime(array, units, first_value=None): ) else: d_calendar = x_calendar - # --- End: if if not units: # Set the units to something that is (hopefully) @@ -210,12 +207,13 @@ def _get_calendar(x): return set(cals.tolist()) +@lru_cache(maxsize=32) def new_axis_identifier(existing_axes=(), basename="dim"): - """Return a new, unique axis identifiers. + """Return a new, unique axis identifier. The name is arbitrary and has no semantic meaning. - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK :Parameters: @@ -365,6 +363,71 @@ def dask_compatible(a): """ try: - return a.data._get_data() + return a.data._get_dask() except AttributeError: return a + + +def conform_units(value, units): + """Conform units. + + If *value* has units defined by its `Units` attribute then + + * if the value units are equal to *units* then *value* is returned + unchanged; + + * if the value units are equivalent to *units* then a copy of + *value* convert to *units* is returned; + + * if the value units are not equivalent to *units* then an + exception is raised. + + In all other cases *value* is returned unchanged. + + .. versionadded:: TODODASK + + :Parameters: + + value: + The value whose units are to be conformed to *units*. + + units: `Units` + The units to conform to. + + **Examples:** + + >>> conform_units(1, cf.Units('metres')) + 1 + >>> conform_units([1, 2, 3], cf.Units('metres')) + [1, 2, 3] + >>> import numpy + >>> conform_units(numpy.array([1, 2, 3]), cf.Units('metres')) + array([1, 2, 3]) + >>> conform_units('string', cf.Units('metres')) + 'string' + >>> d = cf.Data([1, 2] , 'm') + >>> conform_units(d, cf.Units('metres')) + + >>> d = cf.Data([1, 2] , 'km') + >>> conform_units(d, cf.Units('metres')) + + >>> conform_units(d, cf.Units('s')) + ... + ValueError: Units are incompatible with units + + """ + try: + value_units = value.Units + except AttributeError: + pass + else: + if value_units.equivalent(units): + if value_units != units: + value = value.copy() + value.Units = units + elif value_units and units: + raise ValueError( + f"Units {value_units!r} are incompatible with units {units!r}" + ) + + return value diff --git a/cf/field.py b/cf/field.py index d40ed264f4..cf7e46ac80 100644 --- a/cf/field.py +++ b/cf/field.py @@ -460,13 +460,9 @@ def __getitem__(self, indices): f"{self.constructs.domain_axis_identity(_)!r} axis" ) - logger.debug( - f" roll, iaxis, shift = {roll} {iaxis} {shift}" - ) # pragma: no cover - - new = new.roll(iaxis, shift) + new = new.roll(shift=shift, axis=iaxis) else: - new = self.copy() + new = self.copy(array=False) # ------------------------------------------------------------ # Subspace the field construct's data @@ -482,7 +478,7 @@ def __getitem__(self, indices): logger.debug(" indices2 = {}".format(indices2)) # pragma: no cover logger.debug(" findices = {}".format(findices)) # pragma: no cover - new_data = new.data[tuple(findices)] + new_data = data[tuple(findices)] # Set sizes of domain axes data_axes = new.get_data_axes() diff --git a/cf/functions.py b/cf/functions.py index f8d4cfd08c..f45ca37429 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -13,6 +13,7 @@ from hashlib import md5 as hashlib_md5 from marshal import dumps as marshal_dumps from math import ceil as math_ceil +from numbers import Integral from os import getpid, listdir, mkdir from os.path import abspath as _os_path_abspath from os.path import dirname as _os_path_dirname @@ -29,17 +30,11 @@ from numpy import __version__ as _numpy__version__ from numpy import all as _numpy_all from numpy import allclose as _x_numpy_allclose -from numpy import array as _numpy_array from numpy import ascontiguousarray as _numpy_ascontiguousarray -from numpy import integer as _numpy_integer from numpy import isclose as _x_numpy_isclose -from numpy import ndim as _numpy_ndim from numpy import shape as _numpy_shape -from numpy import sign as _numpy_sign -from numpy import size as _numpy_size from numpy import take as _numpy_take from numpy import tile as _numpy_tile -from numpy import where as _numpy_where from numpy.ma import all as _numpy_ma_all from numpy.ma import allclose as _numpy_ma_allclose from numpy.ma import is_masked as _numpy_ma_is_masked @@ -1880,7 +1875,7 @@ def _numpy_isclose(a, b, rtol=None, atol=None): # TODODASK - sort out the "numpy" environment -def parse_indices(shape, indices, cyclic=False, numpy_indexing=False): +def parse_indices(shape, indices, cyclic=False, keepdims=True): """TODODASK. :Parameters: @@ -1889,33 +1884,31 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False): indices: `tuple` (not a `list`!) + keepdims: `bool`, optional + If True then an integral index is converted to a + slice. For instance, ``3`` would become ``slice(3, 4)``. + :Returns: `list` [, `dict`] + The parsed indices. If *cyclic* is True the a dictionary + is also returned that contains the parameters needed to + interpret any cyclic slices. **Examples:** >>> cf.parse_indices((5, 8), ([1, 2, 4, 6],)) [array([1, 2, 4, 6]), slice(0, 8, 1)] - >>> cf.parse_indices((5, 8), ([2, 4, 6],)) - [slice(2, 7, 2), slice(0, 8, 1)] + >>> cf.parse_indices((5, 8), (Ellipsis, [2, 4, 6])) + [slice(0, 5, 1), slice(2, 7, 2)] """ parsed_indices = [] roll = {} - flip = [] - compressed_indices = [] - mask_indices = [] if not isinstance(indices, tuple): indices = (indices,) - # if mask and indices: - # arg0 = indices[0] - # if isinstance(arg0, str) and arg0 == 'mask': - # mask_indices = indices[1] - # indices = indices[2:] - # Initialize the list of parsed indices as the input indices with any # Ellipsis objects expanded length = len(indices) @@ -1936,36 +1929,20 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False): if ndim and len_parsed_indices > ndim: raise IndexError( - "Invalid indices {} for array with shape {}".format( - parsed_indices, shape - ) + f"Invalid indices {parsed_indices} for array with shape {shape}" ) if len_parsed_indices < ndim: parsed_indices.extend([slice(None)] * (ndim - len_parsed_indices)) if not ndim and parsed_indices: - # # If data is scalar then allow it to be indexed with an - # # equivalent to [0] - # if (len_parsed_indices == 1 and - # parsed_indices[0] in (0, - # -1, - # slice(0, 1), - # slice(-1, None, -1), - # slice(None, None, None))): - # parsed_indices = [] - # else: raise IndexError( "Scalar array can only be indexed with () or Ellipsis" ) for i, (index, size) in enumerate(zip(parsed_indices, shape)): - is_slice = False - if isinstance(index, slice): - # -------------------------------------------------------- - # Index is a slice - # -------------------------------------------------------- - is_slice = True + if cyclic and isinstance(index, slice): + # Check for a cyclic slice start = index.start stop = index.stop step = index.step @@ -2015,11 +1992,8 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False): # -9:0:1 => [1, 2, 3, 4, 5, 6, 7, 8, 9] # -9:1:1 => [1, 2, 3, 4, 5, 6, 7, 8, 9, 0] # -10:0:1 => [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - if cyclic and not numpy_indexing: - index = slice(0, stop - start, step) - roll[i] = -start - else: - index = slice(start, stop, step) + index = slice(0, stop - start, step) + roll[i] = -start elif step < 0 and 0 <= start < size and start - size <= stop < 0: # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] @@ -2029,207 +2003,22 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False): # 6:-4:-1 => [6, 5, 4, 3, 2, 1, 0, 9, 8, 7] # 0:-2:-1 => [0, 9] # 0:-10:-1 => [0, 9, 8, 7, 6, 5, 4, 3, 2, 1] - if cyclic and not numpy_indexing: - index = slice(start - stop - 1, None, step) - roll[i] = -1 - stop - else: - index = slice(start, stop, step) + index = slice(start - stop - 1, None, step) + roll[i] = -1 - stop + elif keepdims and isinstance(index, Integral): + # Convert an integral index to a slice + if index == -1: + index = slice(-1, None, None) else: - start, stop, step = index.indices(size) - if ( - start == stop - or (start < stop and step < 0) - or (start > stop and step > 0) - ): - raise IndexError( - "Invalid indices dimension with size {}: {}".format( - size, index - ) - ) - - if step < 0 and stop < 0: - stop = None - index = slice(start, stop, step) - - elif isinstance(index, (int, _numpy_integer)): - # -------------------------------------------------------- - # Index is an integer - # -------------------------------------------------------- - if index < 0: - index += size - - if not numpy_indexing: - index = slice(index, index + 1, 1) - is_slice = True - else: - convert2positve = True - if getattr( - getattr(index, "dtype", None), "kind", None - ) == "b" or isinstance(index[0], bool): - # ---------------------------------------------------- - # Index is a sequence of booleans - # ---------------------------------------------------- - # Convert booleans to non-negative integers. We're - # assuming that anything with a dtype attribute also - # has a size attribute. - if _numpy_size(index) != size: - raise IndexError( - "Incorrect number ({}) of boolean indices for " - "dimension with size {}: {}".format( - _numpy_size(index), size, index - ) - ) - - index = _numpy_where(index)[0] - convert2positve = False - - if not _numpy_ndim(index): - if index < 0: - index += size - index = slice(index, index + 1, 1) - is_slice = True - else: - len_index = len(index) - if len_index == 1: - index = index[0] - if index < 0: - index += size - - index = slice(index, index + 1, 1) - is_slice = True - elif len_index: - if convert2positve: - # Convert to non-negative integer numpy array - index = _numpy_array(index) - index = _numpy_where(index < 0, index + size, index) - - steps = index[1:] - index[:-1] - step = steps[0] - if step and not (steps - step).any(): - # Replace the numpy array index with a slice - if step > 0: - start, stop = index[0], index[-1] + 1 - elif step < 0: - start, stop = index[0], index[-1] - 1 - - if stop < 0: - stop = None - - index = slice(start, stop, step) - is_slice = True - else: - if ( - (step > 0 and (steps <= 0).any()) - or (step < 0 and (steps >= 0).any()) - or not step - ): - raise ValueError( - "Bad index (not strictly monotonic): " - "{}".format(index) - ) - - # if reverse and step < 0: - # # The array is strictly monotonically - # # decreasing, so reverse it so that it's - # # strictly monotonically increasing. Make - # # a note that this dimension will need - # # flipping later - # index = index[::-1] - # flip.append(i) - # step = -step - # - # if envelope: - # # Create an envelope slice for a parsed - # # index of a numpy array of integers - # compressed_indices.append(index) - # - # step = _numpy_sign(step) - # if step > 0: - # stop = index[-1] + 1 - # else: - # stop = index[-1] - 1 - # if stop < 0: - # stop = None - # - # index = slice(index[0], stop, step) - # is_slice = True - else: - raise IndexError( - "Invalid indices {} for array with shape {}".format( - parsed_indices, shape - ) - ) - - if is_slice: - # if reverse and index.step < 0: - # # If the slice step is negative, then transform - # # the original slice to a new slice with a - # # positive step such that the result of the new - # # slice is the reverse of the result of the - # # original slice. - # # - # # For example, if the original slice is - # # slice(6,0,-2) then the new slice will be - # # slice(2,7,2): - # # - # # >>> a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - # # >>> a[slice(6, 0, -2)] - # # [6, 4, 2] - # # >>> a[slice(2, 7, 2)] - # # [2, 4, 6] - # # a[slice(6, 0, -2)] == list(reversed(a[slice(2, 7, 2)])) - # # True - # start, stop, step = index.indices(size) - # step *= -1 - # div, mod = divmod(start-stop-1, step) - # div_step = div*step - # start -= div_step - # stop = start + div_step + 1 - # - # index = slice(start, stop, step) - # flip.append(i) - # # --- End: if - - # If step is greater than one then make sure that - # index.stop isn't bigger than it needs to be - if cyclic and index.step > 1: - start, stop, step = index.indices(size) - div, mod = divmod(stop - start - 1, step) - stop = start + div * step + 1 - index = slice(start, stop, step) - - # - # if envelope: - # # Create an envelope slice for a parsed - # # index of a numpy array of integers - # compressed_indices.append(index) - # index = slice( - # start, stop, (1 if reverse else _numpy_sign(step))) - # --- End: if parsed_indices[i] = index - # if not (cyclic or reverse or envelope or mask): if not cyclic: return parsed_indices - out = [parsed_indices] - - if cyclic: - out.append(roll) - - # if reverse: - # out.append(flip) - # - # if envelope: - # out.append(compressed_indices) - # - # if mask: - # out.append(mask_indices) - - return out + return parsed_indices, roll def get_subspace(array, indices): diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 7f60ad8ab9..6dd4648ab5 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -820,11 +820,144 @@ def test_Data_squeeze_insert_dimension(self): a = f.array self.assertTrue(np.allclose(a, array)) - def test_Data___getitem__(self): + def test_Data__getitem__(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - def test_Data___setitem__(self): + d = cf.Data(np.ma.arange(450).reshape(9, 10, 5), chunks=(4, 5, 1)) + + for indices in ( + Ellipsis, + (slice(None), slice(None)), + (slice(None), Ellipsis), + (Ellipsis, slice(None)), + (Ellipsis, slice(None), Ellipsis), + ): + self.assertEqual(d[indices].shape, d.shape) + + for indices in ( + ([1, 3, 4], slice(None), [2, -1]), + (slice(0, 6, 2), slice(None), [2, -1]), + (slice(0, 6, 2), slice(None), slice(2, 5, 2)), + (slice(0, 6, 2), list(range(10)), slice(2, 5, 2)), + ): + self.assertEqual(d[indices].shape, (3, 10, 2)) + + for indices in ( + (slice(0, 6, 2), -2, [2, -1]), + (slice(0, 6, 2), -2, slice(2, 5, 2)), + ): + self.assertEqual(d[indices].shape, (3, 1, 2)) + + for indices in ( + ([1, 3, 4], -2, [2, -1]), + ([4, 3, 1], -2, [2, -1]), + ([1, 4, 3], -2, [2, -1]), + ([4, 1, 4], -2, [2, -1]), + ): + e = d[indices] + self.assertEqual(e.shape, (3, 1, 2)) + self.assertEqual(e._axes, d._axes) + + d.__keepdims_indexing__ = False + self.assertFalse(d.__keepdims_indexing__) + for indices in ( + ([1, 3, 4], -2, [2, -1]), + (slice(0, 6, 2), -2, [2, -1]), + (slice(0, 6, 2), -2, slice(2, 5, 2)), + ([1, 4, 3], -2, [2, -1]), + ([4, 3, 4], -2, [2, -1]), + ([1, 4, 4], -2, [2, -1]), + ): + e = d[indices] + self.assertFalse(e.__keepdims_indexing__) + self.assertEqual(e.shape, (3, 2)) + self.assertEqual(e._axes, d._axes[0::2]) + + self.assertFalse(d.__keepdims_indexing__) + d.__keepdims_indexing__ = True + self.assertTrue(d.__keepdims_indexing__) + + d = cf.Data(np.ma.arange(24).reshape(3, 8)) + e = d[0, 2:4] + + # Cyclic slices + d = cf.Data(np.ma.arange(24).reshape(3, 8)) + d.cyclic(1) + self.assertTrue((d[0, :6].array == [[0, 1, 2, 3, 4, 5]]).all()) + e = d[0, -2:4] + self.assertEqual(e._axes, d._axes) + self.assertEqual(e.shape, (1, 6)) + self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3]]).all()) + self.assertFalse(e.cyclic()) + + d.__keepdims_indexing__ = False + e = d[:, 4] + self.assertEqual(e.shape, (3,)) + self.assertFalse(e.cyclic()) + self.assertEqual(e._axes, d._axes[0:1]) + d.__keepdims_indexing__ = True + + e = d[0, -2:6] + self.assertEqual(e.shape, (1, 8)) + self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3, 4, 5]]).all()) + self.assertTrue(e.cyclic(), set([1])) + + with self.assertRaises(IndexError): + # Cyclic slice of non-cyclic axis + e = d[-1:1] + + d.cyclic(0) + e = d[-1:1, -2:-4] + self.assertEqual(e.shape, (2, 6)) + self.assertTrue((e[:, 0].array == [[22], [6]]).all()) + self.assertTrue((e[0].array == [[22, 23, 16, 17, 18, 19]]).all()) + self.assertFalse(e.cyclic()) + + e = d[-1:2, -2:4] + self.assertEqual(e.shape, (3, 6)) + self.assertEqual(e.cyclic(), set([0])) + e = d[-1:1, -2:6] + self.assertEqual(e.shape, (2, 8)) + self.assertEqual(e.cyclic(), set([1])) + e = d[-1:2, -2:6] + self.assertEqual(e.shape, (3, 8)) + self.assertEqual(e.cyclic(), set([0, 1])) + + d.cyclic(0, False) + d.__keepdims_indexing__ = False + e = d[0, :6] + self.assertFalse(e.__keepdims_indexing__) + self.assertEqual(e.shape, (6,)) + self.assertTrue((e.array == [0, 1, 2, 3, 4, 5]).all()) + e = d[0, -2:4] + self.assertEqual(e.shape, (6,)) + self.assertTrue((e.array == [6, 7, 0, 1, 2, 3]).all()) + self.assertFalse(e.cyclic()) + d.__keepdims_indexing__ = True + + # Keepdims indexing + d = cf.Data([[1, 2, 3], [4, 5, 6]]) + self.assertEqual(d[0].shape, (1, 3)) + self.assertEqual(d[:, 1].shape, (2, 1)) + self.assertEqual(d[0, 1].shape, (1, 1)) + d.__keepdims_indexing__ = False + self.assertEqual(d[0].shape, (3,)) + self.assertEqual(d[:, 1].shape, (2,)) + self.assertEqual(d[0, 1].shape, ()) + d.__keepdims_indexing__ = True + + # Orthogonal indexing + self.assertEqual(d[[0], [0, 2]].shape, (1, 2)) + self.assertEqual(d[[0, 1], [0, 2]].shape, (2, 2)) + self.assertEqual(d[[0, 1], [2]].shape, (2, 1)) + + # Ancillary masks + # + # TODODASK: Test __getitem__ with ancillary masks. Can only do + # this when cf.Data.where has been daskified + + def test_Data__setitem__(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -835,11 +968,10 @@ def test_Data___setitem__(self): else: a.soften_mask() - d = cf.Data(a, "m", chunks=(2, 2), hardmask=hardmask) + d = cf.Data(a.copy(), "metres", hardmask=hardmask, chunks=(3, 5)) - # Scalar assignment a[:, 1] = np.ma.masked - d[:, 1] = np.ma.masked + d[:, 1] = cf.masked a[0, 2] = -6 d[0, 2] = -6 @@ -862,16 +994,38 @@ def test_Data___setitem__(self): a[8, [8, 6, 5]] = -5 d[8, [8, 6, 5]] = -5 - self.assertTrue((d.array == a).all()) - self.assertTrue((d.array.mask == a.mask).all()) + a[...] = -a + d[...] = -d - # Non-scalar assignment (TODODASK - add more tests) a[0] = a[2] - d[0] = a[2] + d[0] = d[2] self.assertTrue((d.array == a).all()) self.assertTrue((d.array.mask == a.mask).all()) + # Units + a = np.ma.arange(90).reshape(9, 10) + d = cf.Data(a, "metres") + d[...] = cf.Data(a * 100, "cm") + self.assertTrue((d.array == a).all()) + self.assertTrue((d.array.mask == a.mask).all()) + + # Cyclic axes + d.cyclic(1) + self.assertTrue((d[0].array == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).all()) + d[0, -1:1] = [-99, -1] + self.assertTrue( + (d[0].array == [-1, 1, 2, 3, 4, 5, 6, 7, 8, -99]).all() + ) + self.assertEqual(d.cyclic(), set([1])) + + # Multiple list/1-d array indices + with self.assertRaises(NotImplementedError): + d[[1, 2], [0, 4, 1]] = 9 + + with self.assertRaises(NotImplementedError): + d[[1], [0, 4, 1]] = 9 + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_outerproduct(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: