From 3acffce895c9c23952e80bbabedcca39014dc519 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 1 Sep 2021 23:07:38 +0100 Subject: [PATCH 01/12] cyclic axes --- cf/data/data.py | 223 +++++++++++++++++++++++++++++++++---------- cf/functions.py | 176 ++-------------------------------- cf/test/test_Data.py | 96 +++++++++++++++++++ 3 files changed, 277 insertions(+), 218 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index edea334fe9..6d39ea30c3 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -4,6 +4,7 @@ from itertools import product from json import dumps as json_dumps from json import loads as json_loads +from numbers import Integral from operator import mul try: @@ -946,8 +947,6 @@ def __getitem__(self, indices): if indices is Ellipsis: return self.copy() - d = self - auxiliary_mask = () try: arg = indices[0] @@ -957,54 +956,125 @@ def __getitem__(self, indices): if isinstance(arg, str) and arg == "mask": auxiliary_mask = indices[1] indices = indices[2:] - + keepdims = self.__keepdims_indexing__ indices, roll = parse_indices( - d.shape, indices, cyclic=True, numpy_indexing=False + self.shape, + indices, + cyclic=True, + keepdims=keepdims, ) - # TODODASK - sort out the "numpy" environment - - # TODODASK - multiple list indices - - axes = d._axes - cyclic_axes = d._cyclic - + axes = self._axes + cyclic_axes = self._cyclic + print ('parsed_indices=', indices) if roll: # Roll axes with cyclic slices. For example, if slice(-2, - # 3) has been requested on a cyclic axis (and we're not - # using numpy indexing), then we roll that axis by two - # points and apply the slice(0, 5) instead. - if cyclic_axes.intersection([axes[i] for i in roll]): + # 3) has been requested on a cyclic axis, then we roll + # that axis by two points and apply the slice(0, 5) + # instead. + if not cyclic_axes.issuperset([axes[i] for i in roll]): raise IndexError( "Can't take a cyclic slice of a non-cyclic axis" ) - d = d.roll(axis=tuple(roll.keys()), shift=tuple(roll.values())) - new = d + new = self.roll( + axis=tuple(roll.keys()), shift=tuple(roll.values()) + ) + dx = new._get_dask() else: - new = d.copy(array=False) + new = self.copy(array=False) + dx = self._get_dask() + + # Subspace the array + if self.__orthogonal_indexing__: + # Apply 'orthogonal indexing': indices that are 1-d arrays + # or lists subspace along each dimension + # independently. This behaviour is similar to Fortran, but + # different to dask. + axes_with_list_indices = [ + i + for i, x in enumerate(indices) + if not isinstance(x, (slice, Integral)) + ] + n_axes_with_list_indices = len(axes_with_list_indices) - # Get the subspaced dask array - dx = d._get_dask() - dx = dx[tuple(indices)] + if n_axes_with_list_indices < 2: + # At most one axis has a list-of-integers index so do + # a normal dask subspace + dx = dx[tuple(indices)] + print (indices) + else: + # At least two axes have list-of-integers indices so + # we can't do a normal dask subspace + + # Subspace axes which have list-of-integer indices + for axis in axes_with_list_indices: + dx = da.take(dx, indices[axis], axis=axis) + + if n_axes_with_list_indices < len(indices): + # Subspace axes which didn't have list-of-integer + # indices. (Do this after subspacing axes which + # have list-of-integer indices in case + # __keepdims_indexing__ is False.) + slice_indices = [ + slice(None) if i in axes_with_list_indices else x + for i, x in enumerate(indices) + ] + dx = dx[tuple(slice_indices)] + + else: + raise ValueError("Orthogonal indexing is currently required.") + + # ------------------------------------------------------------ + # Note which axes have not been dropped from the result + # (i.e. those indexed with non-integral indices) + # ------------------------------------------------------------ + if keepdims: + axes_with_non_integral_indices = axes + shape = self.shape + else: + axes_with_non_integral_indices = [ + axis + for axis, x in zip(axes, indices) + if not isinstance(x, Integral) + ] + if len(axes_with_non_integral_indices) < len(indices): + # Never change the _axes attribute in-place + new._axes = axes_with_non_integral_indices + + if cyclic_axes: + cyclic_axes = set( + [axis + for axis in cyclic_axes + if axis in axes_with_non_integral_indices] + ) + shape = [n + for n, axis in zip(self.shape, axes) + if axis in axes_with_non_integral_indices] + + # Set the subspaced dask array new._set_dask(dx) - # Apply any auxiliary mask + # ------------------------------------------------------------ + # Apply auxiliary masks + # ------------------------------------------------------------ for mask in auxiliary_mask: new.where(mask, cf_masked, inplace=True) - # Cyclic axes which have been reduced in size are no longer - # cyclic + # ------------------------------------------------------------ + # Cyclic axes + # ------------------------------------------------------------ if cyclic_axes: - x = [ - axis - for axis, n0, n1 in zip(axes, d.shape, new.shape) - if n1 != n0 and axis in cyclic_axes - ] + # Cyclic axes that have been reduced in size are no longer + # considered to be cyclic + x = [axis + for axis, n0, n1 in zip(axes_with_non_integral_indices, + shape, + new.shape) + if n1 != n0 and axis in cyclic_axes] if x: # Never change the _cyclic attribute in-place new._cyclic = cyclic_axes.difference(x) - # --- End: if return new @@ -1042,10 +1112,20 @@ def __setitem__(self, indices, value): # TODODASK - sort out the "numpy" environment indices, roll = parse_indices( - self.shape, indices, cyclic=True, numpy_indexing=False + self.shape, indices, cyclic=True, keepdims=True ) indices = tuple(indices) + axes_with_list_indices = [ + True for x in indices if not isinstance(x, slice) + ] + if len(axes_with_list_indices) > 1: + raise IndexError( + "Currently limited to at most one dimension's assignment " + "index being a 1-d array of integers or booleans. " + f"Got: {indices}" + ) + if roll: # Roll axes with cyclic slices. For example, if assigning # to slice(-2, 3) has been requested on a cyclic axis (and @@ -1081,7 +1161,6 @@ def __setitem__(self, indices, value): f"Can't assign values with units {value_units!r} " f"to data with units {self_units!r}" ) - # --- End: try # Set the mask hardness, in case any previous operations have # inadvertently changed it. @@ -1099,6 +1178,61 @@ def __setitem__(self, indices, value): return + # ---------------------------------------------------------------- + # Indexing behaviour attributes + # ---------------------------------------------------------------- + @property + @daskified(1) + def __orthogonal_indexing__(self): + """Flag to indicate that orthogonal indexing is supported. + + Always True, indicating that 'orthogonal indexing' is + applied. This means that indices that are 1-d arrays or lists + subspace along each dimension independently. This behaviour is + similar to Fortran, but different to `numpy`. + + .. versionadded:: TODODASK + + .. seealso:: __keepdims_indexing__, + netCDF4.Variable.__orthogonal_indexing__ + + """ + return True + + @property + @daskified(1) + def __keepdims_indexing__(self): + """Flag to indicate whether orthogonal indexing is supported. + + If True then providing a single integer as a single-axis index + does *not* reduce the number of array dimensions by 1. This + behaviour is different to `numpy`. + + If False then providing a single integer as a single-axis + index reduces the number of array dimensions by 1. This + behaviour is the same as `numpy`. + + .. versionadded:: TODODASK + + .. seealso:: __orthogonal_indexing__ + + **Examples:** + + >>> d = cf.Data([[1, 2], [3, 4]]) + >>> d.__keepdims_indexing__ = True + >>> d[0].shape + (1, 2) + >>> d.__keepdims_indexing__ = False + >>> d[0].shape + (2,) + + """ + return self._custom.get("__keepdims_indexing__", True) + + @__keepdims_indexing__.setter + def __keepdims_indexing__(self, value): + self._custom["__keepdims_indexing__"] = bool(value) + # ---------------------------------------------------------------- # Private dask methods # ---------------------------------------------------------------- @@ -5472,12 +5606,13 @@ def _hardmask(self): def _axes(self): """Storage for the axis identifiers. - Contains an ordered sequence of unique (within this `Data` - instance) identifiers for each array axis. + Contains an ordered sequence of identifiers for each array + axis. - .. note:: When an axis identifier is removed from the `_axes` - attribute then it is automatically also removed from - the `_cyclic` attribute. + .. note:: When the axis identifiers are reset, then any axis + identifier named by the `_cyclic` attribute which is + not in the new `_axes` set is automatically removed + `_cyclic` attribute. """ return self._custom["_axes"] @@ -5488,23 +5623,11 @@ def _axes(self, value): self._custom["_axes"] = value # Update cyclic: Remove cyclic axes that are not in the new - # axes + # axes (never update _cyclic in-place) cyclic = self._cyclic if cyclic: self._cyclic = cyclic.intersection(value) - @property - def numpy_indexing(self): - """TODODASK - probably thewrong name - need a gneral numpy - compatability flag. See also confg settings - - """ - return self._custom.get("numpy_indexing", False) - - @numpy_indexing.setter - def numpy_indexing(self, value): - self._custom["numpy_indexing"] = bool(value) - # ---------------------------------------------------------------- # Dask attributes # ---------------------------------------------------------------- diff --git a/cf/functions.py b/cf/functions.py index f8d4cfd08c..d9c46e1872 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -13,6 +13,7 @@ from hashlib import md5 as hashlib_md5 from marshal import dumps as marshal_dumps from math import ceil as math_ceil +from numbers import Integral from os import getpid, listdir, mkdir from os.path import abspath as _os_path_abspath from os.path import dirname as _os_path_dirname @@ -29,14 +30,10 @@ from numpy import __version__ as _numpy__version__ from numpy import all as _numpy_all from numpy import allclose as _x_numpy_allclose -from numpy import array as _numpy_array from numpy import ascontiguousarray as _numpy_ascontiguousarray -from numpy import integer as _numpy_integer from numpy import isclose as _x_numpy_isclose from numpy import ndim as _numpy_ndim from numpy import shape as _numpy_shape -from numpy import sign as _numpy_sign -from numpy import size as _numpy_size from numpy import take as _numpy_take from numpy import tile as _numpy_tile from numpy import where as _numpy_where @@ -1880,7 +1877,7 @@ def _numpy_isclose(a, b, rtol=None, atol=None): # TODODASK - sort out the "numpy" environment -def parse_indices(shape, indices, cyclic=False, numpy_indexing=False): +def parse_indices(shape, indices, cyclic=False, keepdims=True): """TODODASK. :Parameters: @@ -1945,16 +1942,6 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False): parsed_indices.extend([slice(None)] * (ndim - len_parsed_indices)) if not ndim and parsed_indices: - # # If data is scalar then allow it to be indexed with an - # # equivalent to [0] - # if (len_parsed_indices == 1 and - # parsed_indices[0] in (0, - # -1, - # slice(0, 1), - # slice(-1, None, -1), - # slice(None, None, None))): - # parsed_indices = [] - # else: raise IndexError( "Scalar array can only be indexed with () or Ellipsis" ) @@ -2015,7 +2002,7 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False): # -9:0:1 => [1, 2, 3, 4, 5, 6, 7, 8, 9] # -9:1:1 => [1, 2, 3, 4, 5, 6, 7, 8, 9, 0] # -10:0:1 => [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - if cyclic and not numpy_indexing: + if cyclic: index = slice(0, stop - start, step) roll[i] = -start else: @@ -2029,7 +2016,7 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False): # 6:-4:-1 => [6, 5, 4, 3, 2, 1, 0, 9, 8, 7] # 0:-2:-1 => [0, 9] # 0:-10:-1 => [0, 9, 8, 7, 6, 5, 4, 3, 2, 1] - if cyclic and not numpy_indexing: + if cyclic: index = slice(start - stop - 1, None, step) roll[i] = -1 - stop else: @@ -2043,155 +2030,26 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False): or (start > stop and step > 0) ): raise IndexError( - "Invalid indices dimension with size {}: {}".format( - size, index - ) + f"Invalid indices dimension with size {size}: {index}" ) if step < 0 and stop < 0: stop = None index = slice(start, stop, step) - elif isinstance(index, (int, _numpy_integer)): + elif isinstance(index, Integral): # -------------------------------------------------------- # Index is an integer # -------------------------------------------------------- if index < 0: index += size - if not numpy_indexing: + if keepdims: + print("CONVERT") index = slice(index, index + 1, 1) is_slice = True - else: - convert2positve = True - if getattr( - getattr(index, "dtype", None), "kind", None - ) == "b" or isinstance(index[0], bool): - # ---------------------------------------------------- - # Index is a sequence of booleans - # ---------------------------------------------------- - # Convert booleans to non-negative integers. We're - # assuming that anything with a dtype attribute also - # has a size attribute. - if _numpy_size(index) != size: - raise IndexError( - "Incorrect number ({}) of boolean indices for " - "dimension with size {}: {}".format( - _numpy_size(index), size, index - ) - ) - - index = _numpy_where(index)[0] - convert2positve = False - - if not _numpy_ndim(index): - if index < 0: - index += size - - index = slice(index, index + 1, 1) - is_slice = True - else: - len_index = len(index) - if len_index == 1: - index = index[0] - if index < 0: - index += size - - index = slice(index, index + 1, 1) - is_slice = True - elif len_index: - if convert2positve: - # Convert to non-negative integer numpy array - index = _numpy_array(index) - index = _numpy_where(index < 0, index + size, index) - - steps = index[1:] - index[:-1] - step = steps[0] - if step and not (steps - step).any(): - # Replace the numpy array index with a slice - if step > 0: - start, stop = index[0], index[-1] + 1 - elif step < 0: - start, stop = index[0], index[-1] - 1 - - if stop < 0: - stop = None - - index = slice(start, stop, step) - is_slice = True - else: - if ( - (step > 0 and (steps <= 0).any()) - or (step < 0 and (steps >= 0).any()) - or not step - ): - raise ValueError( - "Bad index (not strictly monotonic): " - "{}".format(index) - ) - - # if reverse and step < 0: - # # The array is strictly monotonically - # # decreasing, so reverse it so that it's - # # strictly monotonically increasing. Make - # # a note that this dimension will need - # # flipping later - # index = index[::-1] - # flip.append(i) - # step = -step - # - # if envelope: - # # Create an envelope slice for a parsed - # # index of a numpy array of integers - # compressed_indices.append(index) - # - # step = _numpy_sign(step) - # if step > 0: - # stop = index[-1] + 1 - # else: - # stop = index[-1] - 1 - # if stop < 0: - # stop = None - # - # index = slice(index[0], stop, step) - # is_slice = True - else: - raise IndexError( - "Invalid indices {} for array with shape {}".format( - parsed_indices, shape - ) - ) if is_slice: - # if reverse and index.step < 0: - # # If the slice step is negative, then transform - # # the original slice to a new slice with a - # # positive step such that the result of the new - # # slice is the reverse of the result of the - # # original slice. - # # - # # For example, if the original slice is - # # slice(6,0,-2) then the new slice will be - # # slice(2,7,2): - # # - # # >>> a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - # # >>> a[slice(6, 0, -2)] - # # [6, 4, 2] - # # >>> a[slice(2, 7, 2)] - # # [2, 4, 6] - # # a[slice(6, 0, -2)] == list(reversed(a[slice(2, 7, 2)])) - # # True - # start, stop, step = index.indices(size) - # step *= -1 - # div, mod = divmod(start-stop-1, step) - # div_step = div*step - # start -= div_step - # stop = start + div_step + 1 - # - # index = slice(start, stop, step) - # flip.append(i) - # # --- End: if - # If step is greater than one then make sure that # index.stop isn't bigger than it needs to be if cyclic and index.step > 1: @@ -2200,15 +2058,6 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False): stop = start + div * step + 1 index = slice(start, stop, step) - # - # if envelope: - # # Create an envelope slice for a parsed - # # index of a numpy array of integers - # compressed_indices.append(index) - # index = slice( - # start, stop, (1 if reverse else _numpy_sign(step))) - # --- End: if - parsed_indices[i] = index # if not (cyclic or reverse or envelope or mask): @@ -2220,15 +2069,6 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False): if cyclic: out.append(roll) - # if reverse: - # out.append(flip) - # - # if envelope: - # out.append(compressed_indices) - # - # if mask: - # out.append(mask_indices) - return out diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 7f60ad8ab9..35b57f7615 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -824,6 +824,102 @@ def test_Data___getitem__(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return + d = cf.Data(np.ma.arange(450).reshape(9, 10, 5)) + + for indices in ( + Ellipsis, + (slice(None), slice(None)), + (slice(None), Ellipsis), + (Ellipsis, slice(None)), + (Ellipsis, slice(None), Ellipsis), + ): + self.assertEqual(d[indices].shape, d.shape) + + for indices in ( + ([1, 3, 4], slice(None), [2, -1]), + (slice(0, 6, 2), slice(None), [2, -1]), + (slice(0, 6, 2), slice(None), slice(2, 5, 2)), + (slice(0, 6, 2), list(range(10)), slice(2, 5, 2)), + ): + self.assertEqual(d[indices].shape, (3, 10, 2)) + + for indices in ( + (slice(0, 6, 2), -2, [2, -1]), + (slice(0, 6, 2), -2, slice(2, 5, 2)), + ): + self.assertEqual(d[indices].shape, (3, 1, 2)) + + for indices in ( + ([1, 3, 4], -2, [2, -1]), + ([4, 3, 1], -2, [2, -1]), + ([1, 4, 3], -2, [2, -1]), + ([4, 1, 4], -2, [2, -1]), + ): + self.assertEqual(d[indices].shape, (3, 1, 2)) + + d.__keepdims_indexing__ = False + for indices in ( + ([1, 3, 4], -2, [2, -1]), + (slice(0, 6, 2), -2, [2, -1]), + (slice(0, 6, 2), -2, slice(2, 5, 2)), + ([1, 4, 3], -2, [2, -1]), + ([4, 3, 4], -2, [2, -1]), + ([1, 4, 4], -2, [2, -1]), + ): + self.assertEqual(d[indices].shape, (3, 2)) + + d.__keepdims_indexing__ = True + + # Cyclic slices + d = cf.Data(np.ma.arange(24).reshape(3, 8)) + d.cyclic(1) + self.assertTrue((d[0, :6].array == [[0, 1, 2, 3, 4, 5]]).all()) + e = d[0, -2:4] + self.assertEqual(e.shape, (1, 6)) + self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3]]).all()) + self.assertFalse(e.cyclic()) + + e = d[0, -2:6] + self.assertEqual(e.shape, (1, 8)) + self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3, 4, 5]]).all()) + self.assertTrue(e.cyclic(), set([1])) + + with self.assertRaises(IndexError): + # Cyclic slice of non-cyclic axis + e = d[-1:1] + + d.cyclic(0) + e = d[-1:1, -2:-4] + self.assertEqual(e.shape, (2, 6)) + self.assertTrue((e[:, 0].array == [[22], [6]]).all()) + self.assertTrue((e[0].array == [[22, 23, 16, 17, 18, 19]]).all()) + self.assertFalse(e.cyclic()) + + e = d[-1:2, -2:4] + self.assertEqual(e.shape, (3, 6)) + self.assertEqual(e.cyclic(), set([0])) + e = d[-1:1, -2:6] + self.assertEqual(e.shape, (2, 8)) + self.assertEqual(e.cyclic(), set([1])) + e = d[-1:2, -2:6] + self.assertEqual(e.shape, (3, 8)) + self.assertEqual(e.cyclic(), set([0, 1])) + + d.cyclic(0, False) + print(d.cyclic()) + d.__keepdims__indexing__ = False + self.assertTrue((d[0, :6].array == [0, 1, 2, 3, 4, 5]).all()) + e = d[0, -2:4] + print(e.cyclic()) + self.assertEqual(e.shape, (6,)) + self.assertTrue((e.array == [6, 7, 0, 1, 2, 3]).all()) + self.assertFalse(e.cyclic()) + d.__keepdims__indexing__ = True + + # Ancillary masks + # TODODASK: Test __getitem__ with ancillary masks. Can only do + # this when cf.Data.where has been daskified + def test_Data___setitem__(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return From 900832e6ee09aab65253b54ee63bb529da39bd79 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 6 Sep 2021 12:29:04 +0100 Subject: [PATCH 02/12] getitem: cyclic axes, unit tests --- cf/data/data.py | 142 ++++++++++++++++++++++++++----------------- cf/test/test_Data.py | 39 ++++++++++-- 2 files changed, 119 insertions(+), 62 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 6d39ea30c3..92d6873154 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -919,7 +919,8 @@ def __getitem__(self, indices): (similar to the way vector subscripts work in Fortran). This is the same behaviour as indexing on a `netCDF4.Variable` object. - . seealso:: `__setitem__`, `_parse_indices` + . seealso:: `__setitem__`, `__keepdims_indexing__`, + `__orthogonal_indexing__` :Returns: @@ -956,6 +957,7 @@ def __getitem__(self, indices): if isinstance(arg, str) and arg == "mask": auxiliary_mask = indices[1] indices = indices[2:] + keepdims = self.__keepdims_indexing__ indices, roll = parse_indices( self.shape, @@ -966,12 +968,13 @@ def __getitem__(self, indices): axes = self._axes cyclic_axes = self._cyclic - print ('parsed_indices=', indices) + # ------------------------------------------------------------ + # Roll axes with cyclic slices + # ------------------------------------------------------------ if roll: - # Roll axes with cyclic slices. For example, if slice(-2, - # 3) has been requested on a cyclic axis, then we roll - # that axis by two points and apply the slice(0, 5) - # instead. + # For example, if slice(-2, 3) has been requested on a + # cyclic axis, then we roll that axis by two points and + # apply the slice(0, 5) instead. if not cyclic_axes.issuperset([axes[i] for i in roll]): raise IndexError( "Can't take a cyclic slice of a non-cyclic axis" @@ -985,7 +988,9 @@ def __getitem__(self, indices): new = self.copy(array=False) dx = self._get_dask() - # Subspace the array + # ------------------------------------------------------------ + # Subspace the dask array + # ------------------------------------------------------------ if self.__orthogonal_indexing__: # Apply 'orthogonal indexing': indices that are 1-d arrays # or lists subspace along each dimension @@ -994,37 +999,40 @@ def __getitem__(self, indices): axes_with_list_indices = [ i for i, x in enumerate(indices) - if not isinstance(x, (slice, Integral)) + if isinstance(x, list) or getattr(x, "shape", False) ] n_axes_with_list_indices = len(axes_with_list_indices) if n_axes_with_list_indices < 2: - # At most one axis has a list-of-integers index so do - # a normal dask subspace + # At most one axis has a list/1-d array index so do a + # normal dask subspace dx = dx[tuple(indices)] - print (indices) else: - # At least two axes have list-of-integers indices so - # we can't do a normal dask subspace + # At least two axes have list/1-d array indices so we + # can't do a normal dask subspace - # Subspace axes which have list-of-integer indices + # Subspace axes which have list/1-d array indices for axis in axes_with_list_indices: dx = da.take(dx, indices[axis], axis=axis) if n_axes_with_list_indices < len(indices): - # Subspace axes which didn't have list-of-integer - # indices. (Do this after subspacing axes which - # have list-of-integer indices in case + # Subspace axes which don't have list/1-d array + # indices. (Do this after subspacing axes which do + # have list/1-d array indices, in case # __keepdims_indexing__ is False.) slice_indices = [ slice(None) if i in axes_with_list_indices else x for i, x in enumerate(indices) ] dx = dx[tuple(slice_indices)] - else: raise ValueError("Orthogonal indexing is currently required.") + # ------------------------------------------------------------ + # Set the subspaced dask array + # ------------------------------------------------------------ + new._set_dask(dx) + # ------------------------------------------------------------ # Note which axes have not been dropped from the result # (i.e. those indexed with non-integral indices) @@ -1033,10 +1041,11 @@ def __getitem__(self, indices): axes_with_non_integral_indices = axes shape = self.shape else: + # Dropped axes can no longer be cyclic axes_with_non_integral_indices = [ axis for axis, x in zip(axes, indices) - if not isinstance(x, Integral) + if not (isinstance(x, Integral) or getattr(x, "shape", True)) ] if len(axes_with_non_integral_indices) < len(indices): # Never change the _axes attribute in-place @@ -1044,38 +1053,41 @@ def __getitem__(self, indices): if cyclic_axes: cyclic_axes = set( - [axis - for axis in cyclic_axes - if axis in axes_with_non_integral_indices] + [ + axis + for axis in cyclic_axes + if axis in axes_with_non_integral_indices + ] ) - shape = [n - for n, axis in zip(self.shape, axes) - if axis in axes_with_non_integral_indices] - - # Set the subspaced dask array - new._set_dask(dx) - - # ------------------------------------------------------------ - # Apply auxiliary masks - # ------------------------------------------------------------ - for mask in auxiliary_mask: - new.where(mask, cf_masked, inplace=True) + if cyclic_axes: + shape = [ + n + for n, axis in zip(self.shape, axes) + if axis in axes_with_non_integral_indices + ] # ------------------------------------------------------------ - # Cyclic axes + # Cyclic axes that have been reduced in size are no longer + # considered to be cyclic # ------------------------------------------------------------ if cyclic_axes: - # Cyclic axes that have been reduced in size are no longer - # considered to be cyclic - x = [axis - for axis, n0, n1 in zip(axes_with_non_integral_indices, - shape, - new.shape) - if n1 != n0 and axis in cyclic_axes] + x = [ + axis + for axis, n0, n1 in zip( + axes_with_non_integral_indices, shape, new.shape + ) + if n1 != n0 and axis in cyclic_axes + ] if x: # Never change the _cyclic attribute in-place new._cyclic = cyclic_axes.difference(x) + # ------------------------------------------------------------ + # Apply auxiliary masks + # ------------------------------------------------------------ + for mask in auxiliary_mask: + new.where(mask, cf_masked, None, inplace=True) + return new @daskified(1) @@ -1187,14 +1199,23 @@ def __orthogonal_indexing__(self): """Flag to indicate that orthogonal indexing is supported. Always True, indicating that 'orthogonal indexing' is - applied. This means that indices that are 1-d arrays or lists - subspace along each dimension independently. This behaviour is - similar to Fortran, but different to `numpy`. + applied. This means that when indices are 1-d arrays or lists + then they subspace along each dimension independently. This + behaviour is similar to Fortran, but different to `numpy`. .. versionadded:: TODODASK - .. seealso:: __keepdims_indexing__, - netCDF4.Variable.__orthogonal_indexing__ + .. seealso:: `__keepdims_indexing__`, `__getitem__`, + `__setitem__`, + `netCDF4.Variable.__orthogonal_indexing__` + + **Examples:** + + >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) + >>> d[[0], [0, 2]].shape + (1, 2) + >>> d[[0, 1], [0, 2]].shape + (2, 2) """ return True @@ -1204,27 +1225,36 @@ def __orthogonal_indexing__(self): def __keepdims_indexing__(self): """Flag to indicate whether orthogonal indexing is supported. - If True then providing a single integer as a single-axis index - does *not* reduce the number of array dimensions by 1. This - behaviour is different to `numpy`. + If set to True (the default) then providing a single integer + as a single-axis index does *not* reduce the number of array + dimensions by 1. This behaviour is different to `numpy`. - If False then providing a single integer as a single-axis - index reduces the number of array dimensions by 1. This - behaviour is the same as `numpy`. + If set to False then providing a single integer as a + single-axis index reduces the number of array dimensions by + 1. This behaviour is the same as `numpy`. .. versionadded:: TODODASK - .. seealso:: __orthogonal_indexing__ + .. seealso:: `__orthogonal_indexing__`, `__getitem__`, + `__setitem__` **Examples:** - >>> d = cf.Data([[1, 2], [3, 4]]) + >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) >>> d.__keepdims_indexing__ = True >>> d[0].shape - (1, 2) + (1, 3) + >>> d[:, 1].shape + (2, 1) + >>> d[0, 1].shape + (1, 1) >>> d.__keepdims_indexing__ = False >>> d[0].shape + (3,) + >>> d[:, 1].shape (2,) + >>> d[0, 1].shape + () """ return self._custom.get("__keepdims_indexing__", True) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 35b57f7615..c2da15cb7a 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -820,7 +820,7 @@ def test_Data_squeeze_insert_dimension(self): a = f.array self.assertTrue(np.allclose(a, array)) - def test_Data___getitem__(self): + def test_Data__getitem__(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -858,6 +858,7 @@ def test_Data___getitem__(self): self.assertEqual(d[indices].shape, (3, 1, 2)) d.__keepdims_indexing__ = False + self.assertFalse(d.__keepdims_indexing__) for indices in ( ([1, 3, 4], -2, [2, -1]), (slice(0, 6, 2), -2, [2, -1]), @@ -868,13 +869,20 @@ def test_Data___getitem__(self): ): self.assertEqual(d[indices].shape, (3, 2)) + self.assertFalse(d.__keepdims_indexing__) d.__keepdims_indexing__ = True + self.assertTrue(d.__keepdims_indexing__) + + d = cf.Data(np.ma.arange(24).reshape(3, 8)) + e = d[0, 2:4] + print(e.cyclic(), e.shape) # Cyclic slices d = cf.Data(np.ma.arange(24).reshape(3, 8)) d.cyclic(1) self.assertTrue((d[0, :6].array == [[0, 1, 2, 3, 4, 5]]).all()) e = d[0, -2:4] + print(e.cyclic(), e.shape) self.assertEqual(e.shape, (1, 6)) self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3]]).all()) self.assertFalse(e.cyclic()) @@ -906,17 +914,36 @@ def test_Data___getitem__(self): self.assertEqual(e.cyclic(), set([0, 1])) d.cyclic(0, False) - print(d.cyclic()) - d.__keepdims__indexing__ = False - self.assertTrue((d[0, :6].array == [0, 1, 2, 3, 4, 5]).all()) + d.__keepdims_indexing__ = False + e = d[0, :6] + self.assertFalse(e.__keepdims_indexing__) + self.assertEqual(e.shape, (6,)) + self.assertTrue((e.array == [0, 1, 2, 3, 4, 5]).all()) + print("----") e = d[0, -2:4] - print(e.cyclic()) self.assertEqual(e.shape, (6,)) self.assertTrue((e.array == [6, 7, 0, 1, 2, 3]).all()) self.assertFalse(e.cyclic()) - d.__keepdims__indexing__ = True + d.__keepdims_indexing__ = True + + # Keepdims indexing + d = cf.Data([[1, 2, 3], [4, 5, 6]]) + self.assertEqual(d[0].shape, (1, 3)) + self.assertEqual(d[:, 1].shape, (2, 1)) + self.assertEqual(d[0, 1].shape, (1, 1)) + d.__keepdims_indexing__ = False + self.assertEqual(d[0].shape, (3,)) + self.assertEqual(d[:, 1].shape, (2,)) + self.assertEqual(d[0, 1].shape, ()) + d.__keepdims_indexing__ = True + + # Orthogonal indexing + self.assertEqual(d[[0], [0, 2]].shape, (1, 2)) + self.assertEqual(d[[0, 1], [0, 2]].shape, (2, 2)) + self.assertEqual(d[[0, 1], [2]].shape, (2, 1)) # Ancillary masks + # # TODODASK: Test __getitem__ with ancillary masks. Can only do # this when cf.Data.where has been daskified From 336bd687385ef4a4698f87d817387ce850c02b9a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 6 Sep 2021 12:32:20 +0100 Subject: [PATCH 03/12] remove debugging print statements --- cf/functions.py | 1 - cf/test/test_Data.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/cf/functions.py b/cf/functions.py index d9c46e1872..a6f16a2cb1 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -2045,7 +2045,6 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): index += size if keepdims: - print("CONVERT") index = slice(index, index + 1, 1) is_slice = True diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index c2da15cb7a..eb7a052b5c 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -875,14 +875,12 @@ def test_Data__getitem__(self): d = cf.Data(np.ma.arange(24).reshape(3, 8)) e = d[0, 2:4] - print(e.cyclic(), e.shape) # Cyclic slices d = cf.Data(np.ma.arange(24).reshape(3, 8)) d.cyclic(1) self.assertTrue((d[0, :6].array == [[0, 1, 2, 3, 4, 5]]).all()) e = d[0, -2:4] - print(e.cyclic(), e.shape) self.assertEqual(e.shape, (1, 6)) self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3]]).all()) self.assertFalse(e.cyclic()) @@ -919,7 +917,6 @@ def test_Data__getitem__(self): self.assertFalse(e.__keepdims_indexing__) self.assertEqual(e.shape, (6,)) self.assertTrue((e.array == [0, 1, 2, 3, 4, 5]).all()) - print("----") e = d[0, -2:4] self.assertEqual(e.shape, (6,)) self.assertTrue((e.array == [6, 7, 0, 1, 2, 3]).all()) From 1bfe9fe127199cd838581d84b871005be9845aed Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 6 Sep 2021 17:26:29 +0100 Subject: [PATCH 04/12] setitem --- cf/data/data.py | 111 ++++++++++++++++++++++--------------------- cf/data/utils.py | 75 +++++++++++++++++++++++++++-- cf/functions.py | 32 +++++-------- cf/test/test_Data.py | 36 ++++++++++---- 4 files changed, 166 insertions(+), 88 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 92d6873154..4a3f8a74cf 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -105,6 +105,7 @@ from .partition import Partition from .partitionmatrix import PartitionMatrix from .utils import ( # is_small,; is_very_small, + conform_units, convert_to_datetime, convert_to_reftime, dask_compatible, @@ -968,6 +969,7 @@ def __getitem__(self, indices): axes = self._axes cyclic_axes = self._cyclic + # ------------------------------------------------------------ # Roll axes with cyclic slices # ------------------------------------------------------------ @@ -1102,6 +1104,12 @@ def __setitem__(self, indices, value): a subspace. See `__getitem__` for details on how to define subspace of the data array. + .. note:: Currently at most one dimension's assignment index + may be a 1-d array of integers or booleans. This is + is different to `__getitem__`, which applies + 'orthogonal indexing' when multiple indices of 1-d + array of integers or booleans are present. + **Missing data** The treatment of missing data elements during assignment to a @@ -1116,77 +1124,69 @@ def __setitem__(self, indices, value): the `cf.masked` constant or by assignment to a value which contains masked elements. - .. seealso:: `cf.masked`, `hardmask`, `where` + .. seealso:: `__getitem__`, `cf.masked`, `hardmask`, `where` **Examples:** """ - # TODODASK - sort out the "numpy" environment - indices, roll = parse_indices( self.shape, indices, cyclic=True, keepdims=True ) indices = tuple(indices) axes_with_list_indices = [ - True for x in indices if not isinstance(x, slice) + i + for i, x in enumerate(indices) + if isinstance(x, list) or getattr(x, "shape", False) ] if len(axes_with_list_indices) > 1: - raise IndexError( + raise NotImplementedError( "Currently limited to at most one dimension's assignment " "index being a 1-d array of integers or booleans. " f"Got: {indices}" ) + # TODODASK: The inherited algorithm that does assignment + # for multiple list/1-d array indices + # (cfdm.Data._set_subspace) won't work when the + # 1-d array is a dask array because the array + # may need to be computed at __setitem__ + # runtime, which is not desirable. Until this + # can be fixed, it's easiest to disallow this + # case, that was allowed pre-dask. + # ------------------------------------------------------------ + # Roll axes with cyclic slices + # ------------------------------------------------------------ if roll: - # Roll axes with cyclic slices. For example, if assigning - # to slice(-2, 3) has been requested on a cyclic axis (and - # we're not using numpy indexing), then we roll that axis - # by two points and assign to slice(0, 5) instead. The - # axis is then unrolled by two points afer the assignment - # has been made. + # For example, if assigning to slice(-2, 3) has been + # requested on a cyclic axis (and we're not using numpy + # indexing), then we roll that axis by two points and + # assign to slice(0, 5) instead. The axis is then unrolled + # by two points afer the assignment has been made. axes = self._axes - if self._cyclic.intersection([axes[i] for i in roll]): + if not self._cyclic.issuperset([axes[i] for i in roll]): raise IndexError( "Can't do a cyclic assignment to a non-cyclic axis" ) roll_axes = tuple(roll.keys()) shifts = tuple(roll.values()) - self.roll(axis=roll_axes, shift=shifts, inplace=True) - - # TODODASK: multiple lists + self.roll(shift=shifts, axis=roll_axes, inplace=True) # Make sure that the units of value are the same as self - try: - value_units = value.Units - except AttributeError: - pass - else: - self_units = self.Units - if value_units.equivalent(self_units): - if value_units != self_units: - value = value.copy() - value.Units = self.Units - elif value_units and self_units: - raise ValueError( - f"Can't assign values with units {value_units!r} " - f"to data with units {self_units!r}" - ) - - # Set the mask hardness, in case any previous operations have - # inadvertently changed it. - self._set_mask_hardness() + value = conform_units(value, self.Units) # Do the assignment dx = self._get_dask() dx[indices] = dask_compatible(value) + # ------------------------------------------------------------ + # Unroll any axes that were rolled to enable a cyclic + # assignment + # ------------------------------------------------------------ if roll: - # Unroll any axes that were rolled to enable a cyclic - # assignment shifts = [-shift for shift in shifts] - self.roll(axis=roll_axes, shift=shifts, inplace=True) + self.roll(shift=shifts, axis=roll_axes, inplace=True) return @@ -5847,8 +5847,6 @@ def hardmask(self): changes the `hardmask` attribute to `False`. The mask can be re-hardened with `harden_mask` method. - The following operations - .. seealso:: `harden_mask`, `soften_mask`, `where` **Examples:** @@ -5897,7 +5895,7 @@ def is_masked(self): **Performance** - `is_masked` causes all delayed operations to be computed. + `is_masked` causes all delayed operations to be executed. **Examples:** @@ -5909,6 +5907,13 @@ def is_masked(self): True """ + # TODODASK: (General point) When we run something that + # executes all of the lazy operations (like + # is_masked), should/could we replace the dask array + # of self with a "persisted" version of the computed + # data? If we did this, we would want to have the + # ability to cache persisted chunks to disk, as they + # came into being on each thread. def is_masked(a): out = np.ma.is_masked(a) @@ -5916,7 +5921,7 @@ def is_masked(a): dx = self._get_dask() - out_ind = tuple(dx.ndim) + out_ind = tuple(range(dx.ndim)) dx_ind = out_ind dx = da.blockwise( @@ -6111,7 +6116,6 @@ def array(self): a.harden_mask() else: a.soften_mask() - # --- End: if return a @@ -12660,6 +12664,14 @@ def roll(self, axis, shift, inplace=False, i=False): *Parameter example:* Convolve the last axis: ``axis=-1``. + shift: `int`, or `tuple` of `int` + The number of places by which elements are shifted. + If a `tuple`, then *axis* must be a tuple of the same + size, and each of the given axes is shifted by the + corresponding number. If an `int` while *axis* is a + tuple of `int`, then the same value is used for all + given axes. + {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} @@ -12669,21 +12681,12 @@ def roll(self, axis, shift, inplace=False, i=False): `Data` or `None` """ - d = _inplace_enabled_define_and_cleanup(self) - - if isinstance(shift, int): - shift = (shift,) + # TODODASK - consider matching the numpy/dask api: "shift, axis=" - axes = self._parse_axes(axis) - if len(axes) != len(shift): - raise ValueError("TODODASK") - - if not shift: - # Null roll - return d + d = _inplace_enabled_define_and_cleanup(self) dx = d._get_dask() - dx = da.roll(dx, axis=axes, shift=shift) + dx = da.roll(dx, shift, axis=axis) d._set_dask(dx) return d diff --git a/cf/data/utils.py b/cf/data/utils.py index d7a150eba1..ca1584ba85 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -1,5 +1,5 @@ """General functions useful for `Data` functionality.""" -from functools import partial +from functools import lru_cache, partial from itertools import product import numpy as np @@ -93,7 +93,6 @@ def convert_to_reftime(array, units, first_value=None): ) else: d_calendar = x_calendar - # --- End: if if not units: # Set the units to something that is (hopefully) @@ -210,12 +209,13 @@ def _get_calendar(x): return set(cals.tolist()) +@lru_cache(maxsize=32) def new_axis_identifier(existing_axes=(), basename="dim"): - """Return a new, unique axis identifiers. + """Return a new, unique axis identifier. The name is arbitrary and has no semantic meaning. - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK :Parameters: @@ -365,6 +365,71 @@ def dask_compatible(a): """ try: - return a.data._get_data() + return a.data._get_dask() except AttributeError: return a + + +def conform_units(value, units): + """Conform units. + + If *value* has units defined by its `Units` attribute then + + * if the value units are equal to *units* then *value* is returned + unchanged; + + * if the value units are equivalent to *units* then a copy of + *value* with the same units as data* is returned; + + * if the value units are not equivalent to *units* then an + exception is raised. + + In all other cases *value* is returned unchanged. + + .. versionadded:: TODODASK + + :Parameters: + + value: + The value whose units are to be conformed to *units*. + + units: `Units` + The units to conform to. + + **Examples:** + + >>> conform_units(1, cf.Units('metres')) + 1 + >>> conform_units([1, 2, 3], cf.Units('metres')) + [1, 2, 3] + >>> import numpy + >>> conform_units(numpy.array([1, 2, 3]), cf.Units('metres')) + array([1, 2, 3]) + >>> conform_units('string', cf.Units('metres')) + 'string' + >>> d = cf.Data([1, 2] , 'm') + >>> conform_units(d, cf.Units('metres')) + + >>> d = cf.Data([1, 2] , 'km') + >>> conform_units(d, cf.Units('metres')) + + >>> conform_units(d, cf.Units('s')) + ... + ValueError: Units are incompatible with units + + """ + try: + value_units = value.Units + except AttributeError: + pass + else: + if value_units.equivalent(units): + if value_units != units: + value = value.copy() + value.Units = units + elif value_units and units: + raise ValueError( + f"Units {value_units!r} are incompatible with units {units!r}" + ) + + return value diff --git a/cf/functions.py b/cf/functions.py index a6f16a2cb1..c8e48540e0 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -1886,33 +1886,31 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): indices: `tuple` (not a `list`!) + keepdims: `bool`, optional + If True then an integral index is converted to a + slice. For instance, ``3`` would become ``slice(3, 4)``. + :Returns: `list` [, `dict`] + The parsed indices. If *cyclic* is True the a dictionary + is also returned that contains the parameters needed to + interpret any cyclic slices. **Examples:** >>> cf.parse_indices((5, 8), ([1, 2, 4, 6],)) [array([1, 2, 4, 6]), slice(0, 8, 1)] - >>> cf.parse_indices((5, 8), ([2, 4, 6],)) - [slice(2, 7, 2), slice(0, 8, 1)] + >>> cf.parse_indices((5, 8), (Ellipsis, [2, 4, 6])) + [slice(0, 5, 1), slice(2, 7, 2)] """ parsed_indices = [] roll = {} - flip = [] - compressed_indices = [] - mask_indices = [] if not isinstance(indices, tuple): indices = (indices,) - # if mask and indices: - # arg0 = indices[0] - # if isinstance(arg0, str) and arg0 == 'mask': - # mask_indices = indices[1] - # indices = indices[2:] - # Initialize the list of parsed indices as the input indices with any # Ellipsis objects expanded length = len(indices) @@ -1933,9 +1931,7 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): if ndim and len_parsed_indices > ndim: raise IndexError( - "Invalid indices {} for array with shape {}".format( - parsed_indices, shape - ) + f"Invalid indices {parsed_indices} for array with shape {shape}" ) if len_parsed_indices < ndim: @@ -2059,16 +2055,10 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): parsed_indices[i] = index - # if not (cyclic or reverse or envelope or mask): if not cyclic: return parsed_indices - out = [parsed_indices] - - if cyclic: - out.append(roll) - - return out + return parsed_indices, roll def get_subspace(array, indices): diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index eb7a052b5c..b0c8afac6a 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -955,11 +955,10 @@ def test_Data___setitem__(self): else: a.soften_mask() - d = cf.Data(a, "m", chunks=(2, 2), hardmask=hardmask) + d = cf.Data(a.copy(), "metres", hardmask=hardmask) - # Scalar assignment - a[:, 1] = np.ma.masked - d[:, 1] = np.ma.masked + a[:, 1] = cf.masked + d[:, 1] = cf.masked a[0, 2] = -6 d[0, 2] = -6 @@ -982,16 +981,37 @@ def test_Data___setitem__(self): a[8, [8, 6, 5]] = -5 d[8, [8, 6, 5]] = -5 - self.assertTrue((d.array == a).all()) - self.assertTrue((d.array.mask == a.mask).all()) + a[...] = -a + d[...] = -d - # Non-scalar assignment (TODODASK - add more tests) a[0] = a[2] - d[0] = a[2] + d[0] = d[2] self.assertTrue((d.array == a).all()) self.assertTrue((d.array.mask == a.mask).all()) + # Units + a = np.ma.arange(90).reshape(9, 10) + d = cf.Data(a, "metres") + d[...] = cf.Data(a * 100, "cm") + self.assertTrue((d.array == a).all()) + self.assertTrue((d.array.mask == a.mask).all()) + + # Cyclic + d.cyclic(1) + self.assertTrue((d[0].array == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).all()) + d[0, -1:1] = [-99, -1] + self.assertTrue( + (d[0].array == [-1, 1, 2, 3, 4, 5, 6, 7, 8, -99]).all() + ) + self.assertEqual(d.cyclic(), set([1])) + + with self.assertRaises(NotImplementedError): + d[[1, 2], [0, 4, 1]] = 9 + + with self.assertRaises(NotImplementedError): + d[[1], [0, 4, 1]] = 9 + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_outerproduct(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: From 8f07e1fbdeab7b3434b1ef2c2994cba52fbbec3b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 8 Sep 2021 18:26:48 +0100 Subject: [PATCH 05/12] hardmask, unknown shape --- cf/data/data.py | 392 ++++++++++++++++++++++++++----------------- cf/field.py | 10 +- cf/functions.py | 60 ++----- cf/test/test_Data.py | 26 ++- 4 files changed, 274 insertions(+), 214 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 4a3f8a74cf..a18cdb243e 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1,4 +1,5 @@ import logging +import math import operator from functools import partial, reduce, wraps from itertools import product @@ -113,6 +114,11 @@ new_axis_identifier, ) +# from .chunk_utils import ( # is_small,; is_very_small, +# harden_mask_chunk, +# soften_mask_chunk, +# ) + # from dask.array import Array @@ -300,8 +306,7 @@ def __init__( array: optional The array of values. May be any scalar or array-like - object, including another `Data` instance. Ignored if the - *source* parameter is set. + object, including another `Data` instance. *Parameter example:* ``array=[34.6]`` @@ -314,8 +319,7 @@ def __init__( units: `str` or `Units`, optional The physical units of the data. if a `Units` object is - provided then this an also set the calendar. Ignored if - the *source* parameter is set. + provided then this an also set the calendar. The units (without the calendar) may also be set after initialisation with the `set_units` method. @@ -327,8 +331,7 @@ def __init__( ``units='days since 2018-12-01'`` calendar: `str`, optional - The calendar for reference time units. Ignored if the - *source* parameter is set. + The calendar for reference time units. The calendar may also be set after initialisation with the `set_calendar` method. @@ -340,8 +343,7 @@ def __init__( The fill value of the data. By default, or if set to `None`, the `numpy` fill value appropriate to the array's data-type will be used (see - `numpy.ma.default_fill_value`). Ignored if the *source* - parameter is set. + `numpy.ma.default_fill_value`). The fill value may also be set after initialisation with the `set_fill_value` method. @@ -351,7 +353,8 @@ def __init__( dtype: data-type, optional The desired data-type for the data. By default the - data-type will be inferred form the *array* parameter. + data-type will be inferred form the *array* + parameter. The data-type may also be set after initialisation with the `dtype` attribute. @@ -369,11 +372,12 @@ def __init__( mask: optional Apply this mask to the data given by the *array* - parameter. By default, or if *mask* is `None`, no mask is - applied. May be any scalar or array-like object (such as a - `list`, `numpy` array or `Data` instance) that is - broadcastable to the shape of *array*. Masking will be - carried out where the mask elements evaluate to `True`. + parameter. By default, or if *mask* is `None`, no mask + is applied. May be any scalar or array-like object + (such as a `list`, `numpy` array or `Data` instance) + that is broadcastable to the shape of *array*. Masking + will be carried out where the mask elements evaluate + to `True`. This mask will applied in addition to any mask already defined by the *array* parameter. @@ -381,8 +385,10 @@ def __init__( .. versionadded:: 3.0.5 source: optional - Initialize the array, units, calendar and fill value from - those of *source*. + Initialize the data values and metadata (such as + units, mask hardness, etc.) from the data of + *source*. All other arguments, with the exception of + *copy*, are ignored. hardmask: `bool`, optional If False then the mask is soft. By default the mask is @@ -460,7 +466,6 @@ def __init__( "Can't set the 'source' and 'loads' parameters " "at the same time" ) - # --- End: if if source is not None: super().__init__(source=source, _use_array=_use_array) @@ -474,6 +479,11 @@ def __init__( else: self._del_dask(None) + # Note the mask hardness. It is safe to assume that if a + # dask array has been set, then it's mask hardness will be + # already baked into each chunk. + self._hardmask = getattr(source, "hardmask", _DEFAULT_HARDMASK) + return super().__init__( @@ -483,7 +493,8 @@ def __init__( # Create the _HDF_chunks attribute: defines HDF chunking when # writing to disk. # - # Never change the _HDF_chunks attribute in-place. + # Never change the value of the _HDF_chunks attribute + # in-place. self._HDF_chunks = None if loadd is not None: @@ -498,7 +509,12 @@ def __init__( units = Units(units, calendar=calendar) self._Units = units - # Set the mask hardness + # Note the mask hardness. This only records what we want the + # mask hardness to be, and is required in case this + # initialization does not set an array (i.e. array is None or + # _use_array is False). If a dask array is actually set later + # on, then the mask hardness will be set properly, i.e. it + # will be baked into each chunk. self._hardmask = hardmask if array is None: @@ -515,7 +531,7 @@ def __init__( # is removed from _axes then it must also be removed from # _cyclic. # - # Never change the _cyclic attribute in-place. + # Never change the value of the _cyclic attribute in-place. self._cyclic = _empty_set # Create the _axes attribute: an ordered sequence of unique @@ -560,7 +576,6 @@ def __init__( first_value = first_non_missing_value(array) if first_value is not None: dt = hasattr(first_value, "timetuple") - # --- End: if # Convert string or object date-times to floating point # reference times @@ -572,6 +587,9 @@ def __init__( # Store the dask array self._set_dask(array, delete_source=False) + # Set the mask hardness on each chunk. + self.hardmask = hardmask + # Override the data type if dtype is not None: self.dtype = dtype @@ -579,10 +597,6 @@ def __init__( # Apply a mask if mask is not None: self.where(mask, cf_masked, inplace=True) - else: - # Set the mask hardness (which is otherwise set by the - # 'where' method) - self._set_mask_hardness() @property def dask_array(self): @@ -622,17 +636,17 @@ def __contains__(self, value): **Performance** - All delayed operations are computed, and the operation does - not short-circuit once the first occurrence is found. + All delayed operations are exectued, and there is no + short-circuit once the first occurrence is found. **Examples:** - >>> d = Data([[0.0, 1, 2], [3, 4, 5]], 'm') + >>> d = cf.Data([[0.0, 1, 2], [3, 4, 5]], 'm') >>> 4 in d True - >>> Data(3) in d + >>> cf.Data(3) in d True - >>> Data([2.5], units='2 m') in d + >>> cf.Data([2.5], units='2 m') in d True >>> [[2]] in d True @@ -661,7 +675,7 @@ def contains_chunk(a, value): dx = self._get_dask() - out_ind = tuple(dx.ndim) + out_ind = tuple(range(dx.ndim)) dx_ind = out_ind dx = da.blockwise( @@ -920,6 +934,11 @@ def __getitem__(self, indices): (similar to the way vector subscripts work in Fortran). This is the same behaviour as indexing on a `netCDF4.Variable` object. + **Performance** + + If the shape of the data is unknown then it is calculated + immediately by exectuting all delayed operations. + . seealso:: `__setitem__`, `__keepdims_indexing__`, `__orthogonal_indexing__` @@ -959,9 +978,11 @@ def __getitem__(self, indices): auxiliary_mask = indices[1] indices = indices[2:] + shape = self.shape keepdims = self.__keepdims_indexing__ + indices, roll = parse_indices( - self.shape, + shape, indices, cyclic=True, keepdims=keepdims, @@ -1036,37 +1057,24 @@ def __getitem__(self, indices): new._set_dask(dx) # ------------------------------------------------------------ - # Note which axes have not been dropped from the result - # (i.e. those indexed with non-integral indices) + # Set the axis identifiers for the subspace # ------------------------------------------------------------ + shape0 = shape if keepdims: - axes_with_non_integral_indices = axes - shape = self.shape + new_axes = axes else: - # Dropped axes can no longer be cyclic - axes_with_non_integral_indices = [ + new_axes = [ axis for axis, x in zip(axes, indices) - if not (isinstance(x, Integral) or getattr(x, "shape", True)) + if not isinstance(x, Integral) and getattr(x, "shape", True) ] - if len(axes_with_non_integral_indices) < len(indices): - # Never change the _axes attribute in-place - new._axes = axes_with_non_integral_indices - + if new_axes != axes: + new._axes = new_axes + cyclic_axes = new._cyclic if cyclic_axes: - cyclic_axes = set( - [ - axis - for axis in cyclic_axes - if axis in axes_with_non_integral_indices - ] - ) - if cyclic_axes: - shape = [ - n - for n, axis in zip(self.shape, axes) - if axis in axes_with_non_integral_indices - ] + shape0 = [ + n for n, axis in zip(shape, axes) if axis in new_axes + ] # ------------------------------------------------------------ # Cyclic axes that have been reduced in size are no longer @@ -1075,13 +1083,12 @@ def __getitem__(self, indices): if cyclic_axes: x = [ axis - for axis, n0, n1 in zip( - axes_with_non_integral_indices, shape, new.shape - ) - if n1 != n0 and axis in cyclic_axes + for axis, n0, n1 in zip(new_axes, shape0, new.shape) + if axis in cyclic_axes and n0 != n1 ] if x: - # Never change the _cyclic attribute in-place + # Never change the value of the _cyclic attribute + # in-place new._cyclic = cyclic_axes.difference(x) # ------------------------------------------------------------ @@ -1124,6 +1131,11 @@ def __setitem__(self, indices, value): the `cf.masked` constant or by assignment to a value which contains masked elements. + **Performance** + + If the shape of the data is unknown then it is calculated + immediately by executing all delayed operations. + .. seealso:: `__getitem__`, `cf.masked`, `hardmask`, `where` **Examples:** @@ -1154,9 +1166,7 @@ def __setitem__(self, indices, value): # can be fixed, it's easiest to disallow this # case, that was allowed pre-dask. - # ------------------------------------------------------------ # Roll axes with cyclic slices - # ------------------------------------------------------------ if roll: # For example, if assigning to slice(-2, 3) has been # requested on a cyclic axis (and we're not using numpy @@ -1180,14 +1190,17 @@ def __setitem__(self, indices, value): dx = self._get_dask() dx[indices] = dask_compatible(value) - # ------------------------------------------------------------ # Unroll any axes that were rolled to enable a cyclic # assignment - # ------------------------------------------------------------ if roll: shifts = [-shift for shift in shifts] self.roll(shift=shifts, axis=roll_axes, inplace=True) + # Reset the mask hardness, otherwise it could be incorrect in + # the case that a chunk that was not a masked array is + # assigned missing values. + self.hardmask = self.hardmask + return # ---------------------------------------------------------------- @@ -1269,7 +1282,9 @@ def __keepdims_indexing__(self, value): def _get_dask(self): """Get the dask array. - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK + + .. seealso:: `_set_dask`, `_del_dask` :Returns: @@ -1282,7 +1297,9 @@ def _get_dask(self): def _set_dask(self, array, copy=False, delete_source=True): """Set the dask array. - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK + + .. seealso:: `_get_dask`, `_del_dask` :Parameters: @@ -1314,7 +1331,9 @@ def _set_dask(self, array, copy=False, delete_source=True): def _del_dask(self, default=ValueError(), delete_source=True): """Remove the dask array. - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK + + .. seealso:: `_set_dask`, `_get_dask` :Parameters: @@ -1364,10 +1383,9 @@ def _del_dask(self, default=ValueError(), delete_source=True): return out def _dask_map_blocks(self, func, **kwargs): - """Apply a function in-place to the dask array using - `map_blocks`. + """Apply a function to the dask array using `map_blocks`. - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK :Parameters: @@ -1376,8 +1394,8 @@ def _dask_map_blocks(self, func, **kwargs): array. kwargs: optional - Keyword arguments passed to the map `map_blocks` - method of the dask array. + Keyword arguments passed to the + `dask.Array.map_blocks` method of the dask array. :Returns: @@ -2459,14 +2477,14 @@ def loadd(self, d, chunk=True): self._size = reduce(mul, shape, 1) cyclic = d.get("_cyclic", None) - # Never change the _cyclic attribute in-place + # Never change the value of the _cyclic attribute in-place if cyclic: self._cyclic = cyclic.copy() else: self._cyclic = _empty_set HDF_chunks = d.get("_HDF_chunks", None) - # Never change the _HDF_chunks attribute in-place + # Never change the value of the _HDF_chunks attribute in-place if HDF_chunks: self._HDF_chunks = HDF_chunks.copy() else: @@ -5578,13 +5596,12 @@ def _Units(self): def _cyclic(self): """Storage for axis cyclicity. - Identifies which axes are cyclic (and therefore allow cyclic - slicing). - - It must be a subset of the axis identifiers given by the - `_axes` attribute. + Contains a `set` that identifies which axes are cyclic (and + therefore allow cyclic slicing). The set contains a subset of + the axis identifiers defined by the `_axes` attribute. - .. warning:: Never change the `_cyclic` attribute in-place. + .. warning:: Never change the value of the `_cyclic` attribute + in-place. .. note:: When an axis identifier is removed from the `_axes` attribute then it is automatically also removed from @@ -5619,8 +5636,14 @@ def _HDF_chunks(self): del self._custom["_HDF_chunks"] @property + @daskified(1) def _hardmask(self): - """TODODASK.""" + """Storage for the mask hardness. + + Contains a `bool`, where `True` denotes a hard mask and `False` + denotes a soft mask. + + """ return self._custom["_hardmask"] @_hardmask.setter @@ -5636,26 +5659,24 @@ def _hardmask(self): def _axes(self): """Storage for the axis identifiers. - Contains an ordered sequence of identifiers for each array - axis. + Contains a `tuple` of identifiers, one for each array axis. .. note:: When the axis identifiers are reset, then any axis identifier named by the `_cyclic` attribute which is not in the new `_axes` set is automatically removed - `_cyclic` attribute. + from the `_cyclic` attribute. """ return self._custom["_axes"] @_axes.setter def _axes(self, value): - value = tuple(value) - self._custom["_axes"] = value + self._custom["_axes"] = tuple(value) - # Update cyclic: Remove cyclic axes that are not in the new - # axes (never update _cyclic in-place) + # Remove cyclic axes that are not in the new axes cyclic = self._cyclic if cyclic: + # Never change the value of the _cyclic attribute in-place self._cyclic = cyclic.intersection(value) # ---------------------------------------------------------------- @@ -5799,7 +5820,8 @@ def dtype(self): def dtype(self, value): dx = self._get_dask() - # Only change the datatype if it's different + # Only change the datatype if it's different to that of the + # dask array if dx.dtype != value: dx = dx.astype(value) self._set_dask(dx) @@ -5841,11 +5863,15 @@ def hardmask(self): If the `hardmask` attribute is `True`, i.e. there is a hard mask, then unmasking an entry will silently not occur. This - feature prevents overwriting the mask. To allow the unmasking - of an entries when the data has a hard mask, the mask must - first to be softened using the `soften_mask` method, that - changes the `hardmask` attribute to `False`. The mask can be - re-hardened with `harden_mask` method. + feature prevents overwriting the mask. + + To allow the unmasking of an entries when the data has a hard + mask, the mask must first to be softened, either by setting + the `hardmask` attribute to False or equivalently with the + `soften_mask` method. + + The mask can be hardened by setting the `hardmask` attribute + to True or equivalently with the `harden_mask` method. .. seealso:: `harden_mask`, `soften_mask`, `where` @@ -5882,11 +5908,14 @@ def hardmask(self): [9 9 9] """ - return self._custom["_hardmask"] + return self._hardmask @hardmask.setter def hardmask(self, value): - raise AttributeError("TODODASK - use harden_mask/soften_mask instead") + if value: + self.harden_mask() + else: + self.soften_mask() @property @daskified(1) @@ -5961,6 +5990,11 @@ def nbytes(self): Does not include bytes consumed by the array mask + **Performance** + + If the number of bytes is unknown then it is calculated + immediately by executing all delayed operations. + **Examples:** >>> d = cf.Data([[1, 1.5, 2]]) @@ -5978,9 +6012,13 @@ def nbytes(self): """ dx = self._get_dask() - return dx.nbytes + if math.isnan(dx.size): + logger.warning( + "Computing data nbytes: Performance may be degraded" + ) + dx.compute_chunk_sizes() - # TODODASK - what about nans (e.g. after da.unique) + return dx.nbytes @property @daskified(1) @@ -6018,6 +6056,11 @@ def ndim(self): def shape(self): """Tuple of the data array's dimension sizes. + **Performance** + + If the shape of the data is unknown then it is calculated + immediately by executing all delayed operations. + **Examples:** >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) @@ -6038,16 +6081,22 @@ def shape(self): """ dx = self._get_dask() - # if nan: do some compute - return dx.shape + if math.isnan(dx.size): + logger.warning("Computing data shape: Performance may be degraded") + dx.compute_chunk_sizes() - # TODODASK - what about nans (e.g. after da.unique dx.shape -> (nan,)) + return dx.shape @property @daskified(1) def size(self): """Number of elements in the data array. + **Performance** + + If the size of the data is unknown then it is calculated + immediately by executing all delayed operations. + **Examples:** >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) @@ -6072,9 +6121,13 @@ def size(self): """ dx = self._get_dask() - return dx.size + size = dx.size + if math.isnan(size): + logger.warning("Computing data size: Performance may be degraded") + dx.compute_chunk_sizes() + size = dx.size - # TODODASK - what about nans (e.g. after da.unique) + return size @property @daskified(1) @@ -6188,7 +6241,6 @@ def datetime_array(self): a.harden_mask() else: a.soften_mask() - # --- End: if return a @@ -8539,7 +8591,7 @@ def cyclic(self, axes=None, iscyclic=True): axes = [data_axes[i] for i in self._parse_axes(axes)] - # Never change the _cyclic attribute in-place + # Never change the value of the _cyclic attribute in-place if iscyclic: self._cyclic = cyclic_axes.union(axes) else: @@ -9512,15 +9564,14 @@ def harden_mask(self): """Force the mask to hard. Whether the mask of a masked array is hard or soft is - determined by its the `hardmask` property. `harden_mask` sets + determined by its `hardmask` property. `harden_mask` sets `hardmask` to True. + .. versionadded:: TODODASK - **Examples:** + .. seealso:: `hardmask`, `soften_mask` - >>> d = cf.Data([1, 2, 3]) - >>> d.hardmask - True + **Examples:** >>> d = cf.Data([1, 2, 3], hardmask=False) >>> d.hardmask @@ -9529,24 +9580,27 @@ def harden_mask(self): >>> d.hardmask True - """ - - def harden_mask(a): - if np.ma.isMA(a): - a.harden_mask() + >>> d = cf.Data([1, 2, 3], mask=[False, True, False]) + >>> d.hardmask + True + >>> d[1] = 999 + >>> print(d.array) + [1 -- 3] - return a + """ + self._dask_map_blocks(_harden_mask_chunk, dtype=self.dtype) + self._hardmask = True - if self._hardmask: - # Mask is already hard - return + def soften_mask(self): + """Force the mask to soft. - self._dask_map_blocks(harden_mask, dtype=self.dtype) + Whether the mask of a masked array is hard or soft is + determined by its `hardmask` property. `soften_mask` sets + `hardmask` to False. - self._hardmask = True + .. versionadded:: TODODASK - def soften_mask(self): - """TODODASK. + .. seealso:: `hardmask`, `harden_mask` **Examples:** @@ -9557,32 +9611,17 @@ def soften_mask(self): >>> d.hardmask False - """ - - def soften_mask(a): - if np.ma.isMA(a): - a.soften_mask() - - return a - - if not self._hardmask: - # Mask is already soft - return - - self._dask_map_blocks(soften_mask, dtype=self.dtype) + >>> d = cf.Data([1, 2, 3], mask=[False, True, False], hardmask=False) + >>> d.hardmask + False + >>> d[1] = 999 + >>> print(d.array) + [1 999 3] + """ + self._dask_map_blocks(_soften_mask_chunk, dtype=self.dtype) self._hardmask = False - def _set_mask_hardness(self, hardmask=None): - """TODODASK.""" - if hardmask is None: - hardmask = self.hardmask - - if hardmask: - self.harden_mask() - else: - self.soften_mask() - @_inplace_enabled(default=False) def filled(self, fill_value=None, inplace=False): """Replace masked elements with the fill value. @@ -10746,8 +10785,8 @@ def HDF_chunks(self, *chunks): chunks = chunks[0] if chunks is None: - # Clear all chunking. Never change the _HDF_chunks - # attribute in-place. + # Clear all chunking. Never change the value of the + # _HDF_chunks attribute in-place. self._HDF_chunks = None return org_HDF_chunks @@ -10758,7 +10797,7 @@ def HDF_chunks(self, *chunks): if _HDF_chunks.values() == [None] * self.ndim: _HDF_chunks = None - # Never change the _HDF_chunks attribute in-place + # Never change the value of the _HDF_chunks attribute in-place self._HDF_chunks = _HDF_chunks return org_HDF_chunks @@ -12142,7 +12181,7 @@ def squeeze(self, axes=None, inplace=False, i=False): hdf = self._HDF_chunks if hdf: - # Never change the _HDF_chunks attribute in-place + # Never change the value of the _HDF_chunks attribute in-place self._HDF_chunks = { axis: size for axis, size in hdf.items() if axis not in axes } @@ -13481,3 +13520,54 @@ def _broadcast(a, shape): tile = shape[0 : len(shape) - len(a_shape)] + tuple(tile[::-1]) return np.tile(a, tile) + + +"""Dask utilities to be called on chunks""" + + +def _harden_mask_chunk(a): + """Harden the mask of a masked `numpy` array. + + Has no effect if the array is not a masked array. + + :Parameters: + + a: `numpy.ndarray` + The array to have a hardened mask. + + :Returns: + + `numpy.ndarray` + The array with hardened mask. + + :Returns: + + `numpy.ndarray` + + """ + if np.ma.isMA(a): + a.harden_mask() + + return a + + +def _soften_mask_chunk(a): + """Soften the mask of a maked `numpy` array. + + Has no effect if the array is not a masked array. + + :Parameters: + + a: `numpy.ndarray` + The array to have a softened mask. + + :Returns: + + `numpy.ndarray` + The array with softened mask. + + """ + if np.ma.isMA(a): + a.soften_mask() + + return a diff --git a/cf/field.py b/cf/field.py index d40ed264f4..cf7e46ac80 100644 --- a/cf/field.py +++ b/cf/field.py @@ -460,13 +460,9 @@ def __getitem__(self, indices): f"{self.constructs.domain_axis_identity(_)!r} axis" ) - logger.debug( - f" roll, iaxis, shift = {roll} {iaxis} {shift}" - ) # pragma: no cover - - new = new.roll(iaxis, shift) + new = new.roll(shift=shift, axis=iaxis) else: - new = self.copy() + new = self.copy(array=False) # ------------------------------------------------------------ # Subspace the field construct's data @@ -482,7 +478,7 @@ def __getitem__(self, indices): logger.debug(" indices2 = {}".format(indices2)) # pragma: no cover logger.debug(" findices = {}".format(findices)) # pragma: no cover - new_data = new.data[tuple(findices)] + new_data = data[tuple(findices)] # Set sizes of domain axes data_axes = new.get_data_axes() diff --git a/cf/functions.py b/cf/functions.py index c8e48540e0..f45ca37429 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -32,11 +32,9 @@ from numpy import allclose as _x_numpy_allclose from numpy import ascontiguousarray as _numpy_ascontiguousarray from numpy import isclose as _x_numpy_isclose -from numpy import ndim as _numpy_ndim from numpy import shape as _numpy_shape from numpy import take as _numpy_take from numpy import tile as _numpy_tile -from numpy import where as _numpy_where from numpy.ma import all as _numpy_ma_all from numpy.ma import allclose as _numpy_ma_allclose from numpy.ma import is_masked as _numpy_ma_is_masked @@ -1943,12 +1941,8 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): ) for i, (index, size) in enumerate(zip(parsed_indices, shape)): - is_slice = False - if isinstance(index, slice): - # -------------------------------------------------------- - # Index is a slice - # -------------------------------------------------------- - is_slice = True + if cyclic and isinstance(index, slice): + # Check for a cyclic slice start = index.start stop = index.stop step = index.step @@ -1998,11 +1992,8 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): # -9:0:1 => [1, 2, 3, 4, 5, 6, 7, 8, 9] # -9:1:1 => [1, 2, 3, 4, 5, 6, 7, 8, 9, 0] # -10:0:1 => [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - if cyclic: - index = slice(0, stop - start, step) - roll[i] = -start - else: - index = slice(start, stop, step) + index = slice(0, stop - start, step) + roll[i] = -start elif step < 0 and 0 <= start < size and start - size <= stop < 0: # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] @@ -2012,46 +2003,15 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): # 6:-4:-1 => [6, 5, 4, 3, 2, 1, 0, 9, 8, 7] # 0:-2:-1 => [0, 9] # 0:-10:-1 => [0, 9, 8, 7, 6, 5, 4, 3, 2, 1] - if cyclic: - index = slice(start - stop - 1, None, step) - roll[i] = -1 - stop - else: - index = slice(start, stop, step) + index = slice(start - stop - 1, None, step) + roll[i] = -1 - stop + elif keepdims and isinstance(index, Integral): + # Convert an integral index to a slice + if index == -1: + index = slice(-1, None, None) else: - start, stop, step = index.indices(size) - if ( - start == stop - or (start < stop and step < 0) - or (start > stop and step > 0) - ): - raise IndexError( - f"Invalid indices dimension with size {size}: {index}" - ) - - if step < 0 and stop < 0: - stop = None - index = slice(start, stop, step) - - elif isinstance(index, Integral): - # -------------------------------------------------------- - # Index is an integer - # -------------------------------------------------------- - if index < 0: - index += size - - if keepdims: index = slice(index, index + 1, 1) - is_slice = True - - if is_slice: - # If step is greater than one then make sure that - # index.stop isn't bigger than it needs to be - if cyclic and index.step > 1: - start, stop, step = index.indices(size) - div, mod = divmod(stop - start - 1, step) - stop = start + div * step + 1 - index = slice(start, stop, step) parsed_indices[i] = index diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index b0c8afac6a..73839b9d9a 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -855,7 +855,9 @@ def test_Data__getitem__(self): ([1, 4, 3], -2, [2, -1]), ([4, 1, 4], -2, [2, -1]), ): - self.assertEqual(d[indices].shape, (3, 1, 2)) + e = d[indices] + self.assertEqual(e.shape, (3, 1, 2)) + self.assertEqual(e._axes, d._axes) d.__keepdims_indexing__ = False self.assertFalse(d.__keepdims_indexing__) @@ -867,7 +869,10 @@ def test_Data__getitem__(self): ([4, 3, 4], -2, [2, -1]), ([1, 4, 4], -2, [2, -1]), ): - self.assertEqual(d[indices].shape, (3, 2)) + e = d[indices] + self.assertFalse(e.__keepdims_indexing__) + self.assertEqual(e.shape, (3, 2)) + self.assertEqual(e._axes, d._axes[0::2]) self.assertFalse(d.__keepdims_indexing__) d.__keepdims_indexing__ = True @@ -881,10 +886,18 @@ def test_Data__getitem__(self): d.cyclic(1) self.assertTrue((d[0, :6].array == [[0, 1, 2, 3, 4, 5]]).all()) e = d[0, -2:4] + self.assertEqual(e._axes, d._axes) self.assertEqual(e.shape, (1, 6)) self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3]]).all()) self.assertFalse(e.cyclic()) + d.__keepdims_indexing__ = False + e = d[:, 4] + self.assertEqual(e.shape, (3,)) + self.assertFalse(e.cyclic()) + self.assertEqual(e._axes, d._axes[0:1]) + d.__keepdims_indexing__ = True + e = d[0, -2:6] self.assertEqual(e.shape, (1, 8)) self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3, 4, 5]]).all()) @@ -944,7 +957,7 @@ def test_Data__getitem__(self): # TODODASK: Test __getitem__ with ancillary masks. Can only do # this when cf.Data.where has been daskified - def test_Data___setitem__(self): + def test_Data__setitem__(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -955,9 +968,9 @@ def test_Data___setitem__(self): else: a.soften_mask() - d = cf.Data(a.copy(), "metres", hardmask=hardmask) + d = cf.Data(a.copy(), "metres", hardmask=hardmask, chunks=(3, 5)) - a[:, 1] = cf.masked + a[:, 1] = np.ma.masked d[:, 1] = cf.masked a[0, 2] = -6 @@ -997,7 +1010,7 @@ def test_Data___setitem__(self): self.assertTrue((d.array == a).all()) self.assertTrue((d.array.mask == a.mask).all()) - # Cyclic + # Cyclic axes d.cyclic(1) self.assertTrue((d[0].array == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).all()) d[0, -1:1] = [-99, -1] @@ -1006,6 +1019,7 @@ def test_Data___setitem__(self): ) self.assertEqual(d.cyclic(), set([1])) + # Multiple list/1-d array indices with self.assertRaises(NotImplementedError): d[[1, 2], [0, 4, 1]] = 9 From 021e068d2558e3d3b152f8fe4c1e02113a61910a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 9 Sep 2021 09:04:26 +0100 Subject: [PATCH 06/12] information --- cf/data/QUESTIONS.rst | 25 +++++++++++++++++++++++++ cf/data/README.rst | 26 ++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 cf/data/QUESTIONS.rst create mode 100644 cf/data/README.rst diff --git a/cf/data/QUESTIONS.rst b/cf/data/QUESTIONS.rst new file mode 100644 index 0000000000..9fcd8e3f12 --- /dev/null +++ b/cf/data/QUESTIONS.rst @@ -0,0 +1,25 @@ +Questions and answers +===================== + +A place to record randoms thought about the daskification of +`cf.Data`, possibly prior to starting an issue on GitHub. + +---- + +Q. When we run something that executes all of the lazy operations + (like `cf.Data.is_masked`), should/could we replace the dask array + with a "persisted" version of the computed data? If we did this, we + would want to have the ability to cache persisted chunks to disk, + as they came into being on each thread (see, for instance, + `chest`). Do do this or not this could be controlled by a + configuation setting. + +A. ? + +---- + +Q. + +A. ? + +---- diff --git a/cf/data/README.rst b/cf/data/README.rst new file mode 100644 index 0000000000..2b4cbe709e --- /dev/null +++ b/cf/data/README.rst @@ -0,0 +1,26 @@ +`cf.Data` developer notes +========================= + +Hardness of the mask +-------------------- + +Every `cf.Data` method that changes the dask array should consider +whether or not the mask hardness needs resetting before +returning. This will be necessary if there is the possibility that the +operation being applied to the dask array could lose the "memory" on +its chunks of whether or not the mask is hard. + +A common situation that causes a chunk to lose its memory of whether +or not the mask is hard is when a chunk could have contained a +unmasked `numpy` array prior to the operation, but the operation could +convert it to a masked `numpy` array. The new masked array will always +have the `numpy` default hardness (i.e. soft), which may be +incorrect. + +The mask hardness is reset as follows: + +.. code-block:: python + + self._reset_mask_hardness() + +See `cf.Data.__setitem__` for an example. From 9c1a4c42c7204d580f598d2a2c0528cb39cae449 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 9 Sep 2021 09:05:08 +0100 Subject: [PATCH 07/12] tidy --- cf/data/data.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index a18cdb243e..2999d7a86a 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1199,7 +1199,7 @@ def __setitem__(self, indices, value): # Reset the mask hardness, otherwise it could be incorrect in # the case that a chunk that was not a masked array is # assigned missing values. - self.hardmask = self.hardmask + self._reset_mask_hardness() return @@ -1416,6 +1416,18 @@ def _dask_map_blocks(self, func, **kwargs): return dx + def _reset_mask_hardness(self): + """Re-apply the mask hardness to the dask array. + + .. versionadded:: TODODASK + + :Returns: + + `None` + + """ + self.hardmask = self.hardmask + @_inplace_enabled(default=False) def diff(self, axis=-1, n=1, inplace=False): """Calculate the n-th discrete difference along the given axis. @@ -5936,13 +5948,6 @@ def is_masked(self): True """ - # TODODASK: (General point) When we run something that - # executes all of the lazy operations (like - # is_masked), should/could we replace the dask array - # of self with a "persisted" version of the computed - # data? If we did this, we would want to have the - # ability to cache persisted chunks to disk, as they - # came into being on each thread. def is_masked(a): out = np.ma.is_masked(a) @@ -10306,7 +10311,8 @@ def datum(self, *index): """Return an element of the data array as a standard Python scalar. - TODODASK consider renameing/aliasing to 'item' + TODODASK: consider renameing/aliasing to 'item'. Might depend + on whether or not the APIs are the same. The first and last elements are always returned with ``d.datum(0)`` and ``d.datum(-1)`` respectively, even if the data From 9d3f5a7b84b0ef661d7635142231e4935aaa2d0f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 9 Sep 2021 10:39:40 +0100 Subject: [PATCH 08/12] hardmask --- cf/data/README.rst | 12 +++---- cf/data/data.py | 80 ++++++++++++++++++++------------------------ cf/test/test_Data.py | 2 +- 3 files changed, 43 insertions(+), 51 deletions(-) diff --git a/cf/data/README.rst b/cf/data/README.rst index 2b4cbe709e..6e404b720e 100644 --- a/cf/data/README.rst +++ b/cf/data/README.rst @@ -4,7 +4,7 @@ Hardness of the mask -------------------- -Every `cf.Data` method that changes the dask array should consider +Any `cf.Data` method that changes the dask array should consider whether or not the mask hardness needs resetting before returning. This will be necessary if there is the possibility that the operation being applied to the dask array could lose the "memory" on @@ -17,10 +17,8 @@ convert it to a masked `numpy` array. The new masked array will always have the `numpy` default hardness (i.e. soft), which may be incorrect. -The mask hardness is reset as follows: +The mask hardness is most easily reset with the +`cf.Data._reset_mask_hardness` method. -.. code-block:: python - - self._reset_mask_hardness() - -See `cf.Data.__setitem__` for an example. +`cf.Data.__setitem__` and `cf.Data.where` are examples of methods that +need to reset the mask in this manner. diff --git a/cf/data/data.py b/cf/data/data.py index 2999d7a86a..6ede789f14 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1382,20 +1382,26 @@ def _del_dask(self, default=ValueError(), delete_source=True): return out - def _dask_map_blocks(self, func, **kwargs): - """Apply a function to the dask array using `map_blocks`. + def _map_blocks(self, func, **kwargs): + """Apply a function to the data in-place. + + .. note:: It may be necessary for a call to `_map_blocks` to + be followed by a call to `_reset_mask_hardness`. .. versionadded:: TODODASK + .. seealso:: `_reset_mask_hardness` + :Parameters: func: - The funciton to be applied to each chunk of the dask + The funciton to be applied to the data, via + `dask.Array.map_blocks`, to each chunk of the dask array. kwargs: optional Keyword arguments passed to the - `dask.Array.map_blocks` method of the dask array. + `dask.Array.map_blocks` method. :Returns: @@ -1405,7 +1411,7 @@ def _dask_map_blocks(self, func, **kwargs): **Examples:** >>> d = cf.Data([1, 2, 3]) - >>> dx = d._dask_map_blocks(lambda x: x / 2, dtype=float) + >>> dx = d._map_blocks(lambda x: x / 2) >>> print(d.array) [0.5 1. 1.5] @@ -5652,8 +5658,10 @@ def _HDF_chunks(self): def _hardmask(self): """Storage for the mask hardness. - Contains a `bool`, where `True` denotes a hard mask and `False` - denotes a soft mask. + Contains a `bool`, where `True` denotes a hard mask and + `False` denotes a soft mask. + + See `hardmask` for details. """ return self._custom["_hardmask"] @@ -5662,10 +5670,6 @@ def _hardmask(self): def _hardmask(self, value): self._custom["_hardmask"] = value - @_hardmask.deleter - def _hardmask(self): - del self._custom["_hardmask"] - @property @daskified(1) def _axes(self): @@ -5758,9 +5762,9 @@ def Units(self, value): dtype = _dtype_float32 else: dtype = _dtype_float - # --- End: if - self._dask_map_blocks( + # Apply the units conversion to the data + self._map_blocks( partial( Units.conform, from_units=old_units, @@ -5874,18 +5878,22 @@ def hardmask(self): """Hardness of the mask. If the `hardmask` attribute is `True`, i.e. there is a hard - mask, then unmasking an entry will silently not occur. This - feature prevents overwriting the mask. + mask, then unmasking an entry will silently not occur. This is + the default, and prevents overwriting the mask. + + If the `hardmask` attribute is `False`, i.e. there is a soft + mask, then masked entries may be overwritten with non-missing + values. - To allow the unmasking of an entries when the data has a hard - mask, the mask must first to be softened, either by setting - the `hardmask` attribute to False or equivalently with the - `soften_mask` method. + To allow the unmasking of masked values, the mask must be + softened by setting the `hardmask` attribute to False, or + equivalently with the `soften_mask` method. The mask can be hardened by setting the `hardmask` attribute - to True or equivalently with the `harden_mask` method. + to True, or equivalently with the `harden_mask` method. - .. seealso:: `harden_mask`, `soften_mask`, `where` + .. seealso:: `harden_mask`, `soften_mask`, `where`, + `__setitem__` **Examples:** @@ -5895,29 +5903,15 @@ def hardmask(self): >>> d[0] = cf.masked >>> print(d.array) [-- 2 3] - >>> d[...]= 9 + >>> d[...]= 999 >>> print(d.array) - [-- 9 9] - >>> d.soften_mask() + [-- 999 999] + >>> d.hardmask = False >>> d.hardmask False >>> d[...] = -1 - False >>> print(d.array) [-1 -1 -1] - >>> d.harden_mask() - >>> d.hardmask - True - >>> d[0] = cf.masked - >>> d = d.where(True, 9) - >>> print(d.array) - [-- 9 9] - >>> d.soften_mask() - >>> d.hardmask - False - >>> d = d.where(True, 9) - >>> print(d.array) - [9 9 9] """ return self._hardmask @@ -9570,7 +9564,7 @@ def harden_mask(self): Whether the mask of a masked array is hard or soft is determined by its `hardmask` property. `harden_mask` sets - `hardmask` to True. + `hardmask` to `True`. .. versionadded:: TODODASK @@ -9593,7 +9587,7 @@ def harden_mask(self): [1 -- 3] """ - self._dask_map_blocks(_harden_mask_chunk, dtype=self.dtype) + self._map_blocks(_harden_mask_chunk, dtype=self.dtype) self._hardmask = True def soften_mask(self): @@ -9601,7 +9595,7 @@ def soften_mask(self): Whether the mask of a masked array is hard or soft is determined by its `hardmask` property. `soften_mask` sets - `hardmask` to False. + `hardmask` to `False`. .. versionadded:: TODODASK @@ -9621,10 +9615,10 @@ def soften_mask(self): False >>> d[1] = 999 >>> print(d.array) - [1 999 3] + [ 1 999 3] """ - self._dask_map_blocks(_soften_mask_chunk, dtype=self.dtype) + self._map_blocks(_soften_mask_chunk, dtype=self.dtype) self._hardmask = False @_inplace_enabled(default=False) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 73839b9d9a..6dd4648ab5 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -824,7 +824,7 @@ def test_Data__getitem__(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - d = cf.Data(np.ma.arange(450).reshape(9, 10, 5)) + d = cf.Data(np.ma.arange(450).reshape(9, 10, 5), chunks=(4, 5, 1)) for indices in ( Ellipsis, From 742c3e202d1fba040598493edf780477d39ec438 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 13 Sep 2021 15:58:56 +0100 Subject: [PATCH 09/12] dask reset_mask_hardness, docs --- cf/data/data.py | 145 +++++++++++++++++++++++++++++++++-------------- cf/data/utils.py | 8 +-- 2 files changed, 105 insertions(+), 48 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 6ede789f14..91f228481c 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -585,7 +585,7 @@ def __init__( self._Units = units # Store the dask array - self._set_dask(array, delete_source=False) + self._set_dask(array, delete_source=False, reset_mask_hardness=False) # Set the mask hardness on each chunk. self.hardmask = hardmask @@ -1049,15 +1049,17 @@ def __getitem__(self, indices): ] dx = dx[tuple(slice_indices)] else: - raise ValueError("Orthogonal indexing is currently required.") + raise NotImplementedError( + "Non-orthogonal indexing has not yet been implemented" + ) # ------------------------------------------------------------ # Set the subspaced dask array # ------------------------------------------------------------ - new._set_dask(dx) + new._set_dask(dx, reset_mask_hardness=False) # ------------------------------------------------------------ - # Set the axis identifiers for the subspace + # Get the axis identifiers for the subspace # ------------------------------------------------------------ shape0 = shape if keepdims: @@ -1160,11 +1162,11 @@ def __setitem__(self, indices, value): # TODODASK: The inherited algorithm that does assignment # for multiple list/1-d array indices # (cfdm.Data._set_subspace) won't work when the - # 1-d array is a dask array because the array - # may need to be computed at __setitem__ - # runtime, which is not desirable. Until this - # can be fixed, it's easiest to disallow this - # case, that was allowed pre-dask. + # 1-d array is a dask array because it may need + # to be computed at __setitem__ runtime, which + # is not desirable. Until this can be fixed, + # it's easiest to disallow this case, that was + # allowed pre-dask. # Roll axes with cyclic slices if roll: @@ -1222,13 +1224,21 @@ def __orthogonal_indexing__(self): `__setitem__`, `netCDF4.Variable.__orthogonal_indexing__` - **Examples:** + **Examples** - >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) - >>> d[[0], [0, 2]].shape + >>> d = cf.Data([[1, 2, 3], + ... [4, 5, 6]]) + >>> e = d[[0], [0, 2]] + >>> e.shape (1, 2) - >>> d[[0, 1], [0, 2]].shape + >>> print(e.array) + [[1 3]] + >>> e = d[[0, 1], [0, 2]] + >>> e.shape (2, 2) + >>> print(e.array) + [[1 3] + [4 6]] """ return True @@ -1251,23 +1261,57 @@ def __keepdims_indexing__(self): .. seealso:: `__orthogonal_indexing__`, `__getitem__`, `__setitem__` - **Examples:** + **Examples** - >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) - >>> d.__keepdims_indexing__ = True - >>> d[0].shape + >>> d = cf.Data([[1, 2, 3], + ... [4, 5, 6]]) + >>> d.__keepdims_indexing__ + True + >>> e = d[0] + >>> e.shape (1, 3) - >>> d[:, 1].shape + >>> print(e.array) + [[1 2 3]] + + >>> d.__keepdims_indexing__ + True + >>> e = d[:, 1] + >>> e.shape (2, 1) - >>> d[0, 1].shape + >>> print(e.array) + [[2] + [5]] + + >>> d.__keepdims_indexing__ + True + >>> e = d[0, 1] + >>> e.shape (1, 1) + >>> print(e.array) + [[2]] + >>> d.__keepdims_indexing__ = False - >>> d[0].shape + >>> e = d[0] + >>> e.shape (3,) - >>> d[:, 1].shape + >>> print(e.array) + [1 2 3] + + >>> d.__keepdims_indexing__ + False + >>> e = d[:, 1] + >>> e.shape (2,) - >>> d[0, 1].shape + >>> print(e.array) + [2 5] + + >>> d.__keepdims_indexing__ + False + >>> e = d[0, 1] + >>> e.shape () + >>> print(e.array) + 2 """ return self._custom.get("__keepdims_indexing__", True) @@ -1294,12 +1338,14 @@ def _get_dask(self): """ return self._custom["dask"] - def _set_dask(self, array, copy=False, delete_source=True): + def _set_dask( + self, array, copy=False, delete_source=True, reset_mask_hardness=True + ): """Set the dask array. .. versionadded:: TODODASK - .. seealso:: `_get_dask`, `_del_dask` + .. seealso:: `_get_dask`, `_del_dask`, `_reset_mask_hardness` :Parameters: @@ -1311,7 +1357,14 @@ def _set_dask(self, array, copy=False, delete_source=True): default the dask array is not copied. delete_source: `bool`, optional - TODODASK + If False then do not delete a compressed source array, + if one exists, after setting the new dask array. By + default a compressed source array is deleted. + + reset_mask_hardness: `bool`, optional + If False then do not reset the mask hardness after + setting the new dask array. By default the mask + hardness is re-applied. :Returns: @@ -1328,6 +1381,9 @@ def _set_dask(self, array, copy=False, delete_source=True): # guarantee its consistency with the new dask array. self._del_Array(None) + if reset_mask_hardness: + self._reset_mask_hardness() + def _del_dask(self, default=ValueError(), delete_source=True): """Remove the dask array. @@ -1385,8 +1441,9 @@ def _del_dask(self, default=ValueError(), delete_source=True): def _map_blocks(self, func, **kwargs): """Apply a function to the data in-place. - .. note:: It may be necessary for a call to `_map_blocks` to - be followed by a call to `_reset_mask_hardness`. + .. note:: This method does not reset the mask hardness. It may + be necessary for a call to `_map_blocks` to be + followed by a call to `_reset_mask_hardness`. .. versionadded:: TODODASK @@ -1396,12 +1453,12 @@ def _map_blocks(self, func, **kwargs): func: The funciton to be applied to the data, via - `dask.Array.map_blocks`, to each chunk of the dask - array. + `dask.array.Array.map_blocks`, to each chunk of the + dask array. kwargs: optional Keyword arguments passed to the - `dask.Array.map_blocks` method. + `dask.array.Array.map_blocks` method. :Returns: @@ -1418,7 +1475,7 @@ def _map_blocks(self, func, **kwargs): """ dx = self._get_dask() dx = dx.map_blocks(func, **kwargs) - self._set_dask(dx) + self._set_dask(dx, reset_mask_hardness=False) return dx @@ -1427,6 +1484,8 @@ def _reset_mask_hardness(self): .. versionadded:: TODODASK + .. seealso:: `hardmask`, `harden_mask`, `soften_mask` + :Returns: `None` @@ -2212,7 +2271,7 @@ def persist(self, inplace=False): dx = self._get_dask() dx = dx.persist() - d._set_dask(dx) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -3151,7 +3210,7 @@ def rechunk( dx = d._get_dask() dx = dx.rechunk(chunks, threshold, block_size_limit, balance) - d._set_dask(dx, delete_source=False) + d._set_dask(dx, delete_source=False, reset_mask_hardness=False) return d @@ -4417,7 +4476,7 @@ def _unary_operation(self, operation): dx = self._get_dask() dx = getattr(operator, operation)(dx) - out._set_dask(dx) + out._set_dask(dx, reset_mask_hardness=False) return out @@ -5840,7 +5899,7 @@ def dtype(self, value): # dask array if dx.dtype != value: dx = dx.astype(value) - self._set_dask(dx) + self._set_dask(dx, reset_mask_hardness=False) @property def fill_value(self): @@ -9129,7 +9188,7 @@ def insert_dimension(self, position=0, inplace=False): dx = d._get_dask() dx = dx.reshape(shape) - d._set_dask(dx) + d._set_dask(dx, reset_mask_hardness=False) # Expand _axes axis = new_axis_identifier(d._axes) @@ -9587,7 +9646,7 @@ def harden_mask(self): [1 -- 3] """ - self._map_blocks(_harden_mask_chunk, dtype=self.dtype) + self._map_blocks(_cf_harden_mask, dtype=self.dtype) self._hardmask = True def soften_mask(self): @@ -9618,7 +9677,7 @@ def soften_mask(self): [ 1 999 3] """ - self._map_blocks(_soften_mask_chunk, dtype=self.dtype) + self._map_blocks(_cf_soften_mask, dtype=self.dtype) self._hardmask = False @_inplace_enabled(default=False) @@ -10762,7 +10821,7 @@ def flip(self, axes=None, inplace=False, i=False): dx = d._get_dask() dx = dx[tuple(index)] - d._set_dask(dx) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -12174,7 +12233,7 @@ def squeeze(self, axes=None, inplace=False, i=False): # one size 1 axis needs squeezing. dx = d._get_dask() dx = dx.squeeze(axis=tuple(axes)) - d._set_dask(dx) + d._set_dask(dx, reset_mask_hardness=False) # Remove the squeezed axes names d._axes = [axis for i, axis in enumerate(d._axes) if i not in axes] @@ -12726,7 +12785,7 @@ def roll(self, axis, shift, inplace=False, i=False): dx = d._get_dask() dx = da.roll(dx, shift, axis=axis) - d._set_dask(dx) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -13525,7 +13584,7 @@ def _broadcast(a, shape): """Dask utilities to be called on chunks""" -def _harden_mask_chunk(a): +def _cf_harden_mask(a): """Harden the mask of a masked `numpy` array. Has no effect if the array is not a masked array. @@ -13551,7 +13610,7 @@ def _harden_mask_chunk(a): return a -def _soften_mask_chunk(a): +def _cf_soften_mask(a): """Soften the mask of a maked `numpy` array. Has no effect if the array is not a masked array. diff --git a/cf/data/utils.py b/cf/data/utils.py index ca1584ba85..fb077ebac9 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -2,13 +2,11 @@ from functools import lru_cache, partial from itertools import product -import numpy as np - import dask.array as da +import numpy as np -from ..cfdatetime import dt2rt, st2rt, rt2dt from ..cfdatetime import dt as cf_dt - +from ..cfdatetime import dt2rt, rt2dt, st2rt from ..units import Units @@ -379,7 +377,7 @@ def conform_units(value, units): unchanged; * if the value units are equivalent to *units* then a copy of - *value* with the same units as data* is returned; + *value* convert to *units* is returned; * if the value units are not equivalent to *units* then an exception is raised. From 59bc506db80b10ac4025bb10cd0dc31691c42bde Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Oct 2021 18:00:54 +0100 Subject: [PATCH 10/12] Typos Co-authored-by: Sadie L. Bartholomew --- cf/data/QUESTIONS.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/QUESTIONS.rst b/cf/data/QUESTIONS.rst index 9fcd8e3f12..77b8ebf0ae 100644 --- a/cf/data/QUESTIONS.rst +++ b/cf/data/QUESTIONS.rst @@ -11,7 +11,7 @@ Q. When we run something that executes all of the lazy operations with a "persisted" version of the computed data? If we did this, we would want to have the ability to cache persisted chunks to disk, as they came into being on each thread (see, for instance, - `chest`). Do do this or not this could be controlled by a + `chest`). To do this or not do this could be controlled by a configuation setting. A. ? From 142556d6bd4f303d3b535ca563852ded136b14b0 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Oct 2021 18:01:14 +0100 Subject: [PATCH 11/12] Typos Co-authored-by: Sadie L. Bartholomew --- cf/data/QUESTIONS.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/QUESTIONS.rst b/cf/data/QUESTIONS.rst index 77b8ebf0ae..9014430d4e 100644 --- a/cf/data/QUESTIONS.rst +++ b/cf/data/QUESTIONS.rst @@ -1,7 +1,7 @@ Questions and answers ===================== -A place to record randoms thought about the daskification of +A place to record random thoughts about the daskification of `cf.Data`, possibly prior to starting an issue on GitHub. ---- From aecd89b5423d4c0b546c5f2da91321b05da7af95 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Oct 2021 18:24:02 +0100 Subject: [PATCH 12/12] Correct docstring Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index 91f228481c..6839572a6a 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1246,7 +1246,7 @@ def __orthogonal_indexing__(self): @property @daskified(1) def __keepdims_indexing__(self): - """Flag to indicate whether orthogonal indexing is supported. + """Flag to indicate whether dimensions indexed with integers are kept. If set to True (the default) then providing a single integer as a single-axis index does *not* reduce the number of array