From 3acffce895c9c23952e80bbabedcca39014dc519 Mon Sep 17 00:00:00 2001
From: David Hassell <david.hassell@ncas.ac.uk>
Date: Wed, 1 Sep 2021 23:07:38 +0100
Subject: [PATCH 01/12] cyclic axes

---
 cf/data/data.py      | 223 +++++++++++++++++++++++++++++++++----------
 cf/functions.py      | 176 ++--------------------------------
 cf/test/test_Data.py |  96 +++++++++++++++++++
 3 files changed, 277 insertions(+), 218 deletions(-)

diff --git a/cf/data/data.py b/cf/data/data.py
index edea334fe9..6d39ea30c3 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -4,6 +4,7 @@
 from itertools import product
 from json import dumps as json_dumps
 from json import loads as json_loads
+from numbers import Integral
 from operator import mul
 
 try:
@@ -946,8 +947,6 @@ def __getitem__(self, indices):
         if indices is Ellipsis:
             return self.copy()
 
-        d = self
-
         auxiliary_mask = ()
         try:
             arg = indices[0]
@@ -957,54 +956,125 @@ def __getitem__(self, indices):
             if isinstance(arg, str) and arg == "mask":
                 auxiliary_mask = indices[1]
                 indices = indices[2:]
-
+        keepdims = self.__keepdims_indexing__
         indices, roll = parse_indices(
-            d.shape, indices, cyclic=True, numpy_indexing=False
+            self.shape,
+            indices,
+            cyclic=True,
+            keepdims=keepdims,
         )
 
-        # TODODASK - sort out the "numpy" environment
-
-        # TODODASK - multiple list indices
-
-        axes = d._axes
-        cyclic_axes = d._cyclic
-
+        axes = self._axes
+        cyclic_axes = self._cyclic
+        print ('parsed_indices=', indices)
         if roll:
             # Roll axes with cyclic slices. For example, if slice(-2,
-            # 3) has been requested on a cyclic axis (and we're not
-            # using numpy indexing), then we roll that axis by two
-            # points and apply the slice(0, 5) instead.
-            if cyclic_axes.intersection([axes[i] for i in roll]):
+            # 3) has been requested on a cyclic axis, then we roll
+            # that axis by two points and apply the slice(0, 5)
+            # instead.
+            if not cyclic_axes.issuperset([axes[i] for i in roll]):
                 raise IndexError(
                     "Can't take a cyclic slice of a non-cyclic axis"
                 )
 
-            d = d.roll(axis=tuple(roll.keys()), shift=tuple(roll.values()))
-            new = d
+            new = self.roll(
+                axis=tuple(roll.keys()), shift=tuple(roll.values())
+            )
+            dx = new._get_dask()
         else:
-            new = d.copy(array=False)
+            new = self.copy(array=False)
+            dx = self._get_dask()
+
+        # Subspace the array
+        if self.__orthogonal_indexing__:
+            # Apply 'orthogonal indexing': indices that are 1-d arrays
+            # or lists subspace along each dimension
+            # independently. This behaviour is similar to Fortran, but
+            # different to dask.
+            axes_with_list_indices = [
+                i
+                for i, x in enumerate(indices)
+                if not isinstance(x, (slice, Integral))
+            ]
+            n_axes_with_list_indices = len(axes_with_list_indices)
 
-        # Get the subspaced dask array
-        dx = d._get_dask()
-        dx = dx[tuple(indices)]
+            if n_axes_with_list_indices < 2:
+                # At most one axis has a list-of-integers index so do
+                # a normal dask subspace
+                dx = dx[tuple(indices)]
+                print (indices)
+            else:
+                # At least two axes have list-of-integers indices so
+                # we can't do a normal dask subspace
+
+                # Subspace axes which have list-of-integer indices
+                for axis in axes_with_list_indices:
+                    dx = da.take(dx, indices[axis], axis=axis)
+
+                if n_axes_with_list_indices < len(indices):
+                    # Subspace axes which didn't have list-of-integer
+                    # indices. (Do this after subspacing axes which
+                    # have list-of-integer indices in case
+                    # __keepdims_indexing__ is False.)
+                    slice_indices = [
+                        slice(None) if i in axes_with_list_indices else x
+                        for i, x in enumerate(indices)
+                    ]
+                    dx = dx[tuple(slice_indices)]
+
+        else:
+            raise ValueError("Orthogonal indexing is currently required.")
+
+        # ------------------------------------------------------------
+        # Note which axes have not been dropped from the result
+        # (i.e. those indexed with non-integral indices)
+        # ------------------------------------------------------------
+        if keepdims:
+            axes_with_non_integral_indices = axes
+            shape = self.shape
+        else:
+            axes_with_non_integral_indices = [
+                axis
+                for axis, x in zip(axes, indices)
+                if not isinstance(x, Integral)
+            ]
+            if len(axes_with_non_integral_indices) < len(indices):
+                # Never change the _axes attribute in-place
+                new._axes = axes_with_non_integral_indices
+
+                if cyclic_axes:
+                    cyclic_axes = set(
+                        [axis
+                         for axis in cyclic_axes
+                         if axis in axes_with_non_integral_indices]
+                    )
+                    shape = [n
+                             for n, axis in zip(self.shape, axes)
+                             if axis in axes_with_non_integral_indices]
+
+        # Set the subspaced dask array
         new._set_dask(dx)
 
-        # Apply any auxiliary mask
+        # ------------------------------------------------------------
+        # Apply auxiliary masks
+        # ------------------------------------------------------------
         for mask in auxiliary_mask:
             new.where(mask, cf_masked, inplace=True)
 
-        # Cyclic axes which have been reduced in size are no longer
-        # cyclic
+        # ------------------------------------------------------------
+        # Cyclic axes
+        # ------------------------------------------------------------
         if cyclic_axes:
-            x = [
-                axis
-                for axis, n0, n1 in zip(axes, d.shape, new.shape)
-                if n1 != n0 and axis in cyclic_axes
-            ]
+            # Cyclic axes that have been reduced in size are no longer
+            # considered to be cyclic
+            x = [axis
+                 for axis, n0, n1 in zip(axes_with_non_integral_indices,
+                                         shape,
+                                         new.shape)
+                 if n1 != n0 and axis in cyclic_axes]
             if x:
                 # Never change the _cyclic attribute in-place
                 new._cyclic = cyclic_axes.difference(x)
-        # --- End: if
 
         return new
 
@@ -1042,10 +1112,20 @@ def __setitem__(self, indices, value):
         # TODODASK - sort out the "numpy" environment
 
         indices, roll = parse_indices(
-            self.shape, indices, cyclic=True, numpy_indexing=False
+            self.shape, indices, cyclic=True, keepdims=True
         )
         indices = tuple(indices)
 
+        axes_with_list_indices = [
+            True for x in indices if not isinstance(x, slice)
+        ]
+        if len(axes_with_list_indices) > 1:
+            raise IndexError(
+                "Currently limited to at most one dimension's assignment "
+                "index being a 1-d array of integers or booleans. "
+                f"Got: {indices}"
+            )
+
         if roll:
             # Roll axes with cyclic slices. For example, if assigning
             # to slice(-2, 3) has been requested on a cyclic axis (and
@@ -1081,7 +1161,6 @@ def __setitem__(self, indices, value):
                     f"Can't assign values with units {value_units!r} "
                     f"to data with units {self_units!r}"
                 )
-        # --- End: try
 
         # Set the mask hardness, in case any previous operations have
         # inadvertently changed it.
@@ -1099,6 +1178,61 @@ def __setitem__(self, indices, value):
 
         return
 
+    # ----------------------------------------------------------------
+    # Indexing behaviour attributes
+    # ----------------------------------------------------------------
+    @property
+    @daskified(1)
+    def __orthogonal_indexing__(self):
+        """Flag to indicate that orthogonal indexing is supported.
+
+        Always True, indicating that 'orthogonal indexing' is
+        applied. This means that indices that are 1-d arrays or lists
+        subspace along each dimension independently. This behaviour is
+        similar to Fortran, but different to `numpy`.
+
+        .. versionadded:: TODODASK
+
+        .. seealso:: __keepdims_indexing__,
+                     netCDF4.Variable.__orthogonal_indexing__
+
+        """
+        return True
+
+    @property
+    @daskified(1)
+    def __keepdims_indexing__(self):
+        """Flag to indicate whether orthogonal indexing is supported.
+
+        If True then providing a single integer as a single-axis index
+        does *not* reduce the number of array dimensions by 1. This
+        behaviour is different to `numpy`.
+
+        If False then providing a single integer as a single-axis
+        index reduces the number of array dimensions by 1. This
+        behaviour is the same as `numpy`.
+
+        .. versionadded:: TODODASK
+
+        .. seealso:: __orthogonal_indexing__
+
+        **Examples:**
+
+        >>> d = cf.Data([[1, 2], [3, 4]])
+        >>> d.__keepdims_indexing__ = True
+        >>> d[0].shape
+        (1, 2)
+        >>> d.__keepdims_indexing__ = False
+        >>> d[0].shape
+        (2,)
+
+        """
+        return self._custom.get("__keepdims_indexing__", True)
+
+    @__keepdims_indexing__.setter
+    def __keepdims_indexing__(self, value):
+        self._custom["__keepdims_indexing__"] = bool(value)
+
     # ----------------------------------------------------------------
     # Private dask methods
     # ----------------------------------------------------------------
@@ -5472,12 +5606,13 @@ def _hardmask(self):
     def _axes(self):
         """Storage for the axis identifiers.
 
-        Contains an ordered sequence of unique (within this `Data`
-        instance) identifiers for each array axis.
+        Contains an ordered sequence of identifiers for each array
+        axis.
 
-        .. note:: When an axis identifier is removed from the `_axes`
-                  attribute then it is automatically also removed from
-                  the `_cyclic` attribute.
+        .. note:: When the axis identifiers are reset, then any axis
+                  identifier named by the `_cyclic` attribute which is
+                  not in the new `_axes` set is automatically removed
+                  `_cyclic` attribute.
 
         """
         return self._custom["_axes"]
@@ -5488,23 +5623,11 @@ def _axes(self, value):
         self._custom["_axes"] = value
 
         # Update cyclic: Remove cyclic axes that are not in the new
-        # axes
+        # axes (never update _cyclic in-place)
         cyclic = self._cyclic
         if cyclic:
             self._cyclic = cyclic.intersection(value)
 
-    @property
-    def numpy_indexing(self):
-        """TODODASK - probably thewrong name - need a gneral numpy
-        compatability flag. See also confg settings
-
-        """
-        return self._custom.get("numpy_indexing", False)
-
-    @numpy_indexing.setter
-    def numpy_indexing(self, value):
-        self._custom["numpy_indexing"] = bool(value)
-
     # ----------------------------------------------------------------
     # Dask attributes
     # ----------------------------------------------------------------
diff --git a/cf/functions.py b/cf/functions.py
index f8d4cfd08c..d9c46e1872 100644
--- a/cf/functions.py
+++ b/cf/functions.py
@@ -13,6 +13,7 @@
 from hashlib import md5 as hashlib_md5
 from marshal import dumps as marshal_dumps
 from math import ceil as math_ceil
+from numbers import Integral
 from os import getpid, listdir, mkdir
 from os.path import abspath as _os_path_abspath
 from os.path import dirname as _os_path_dirname
@@ -29,14 +30,10 @@
 from numpy import __version__ as _numpy__version__
 from numpy import all as _numpy_all
 from numpy import allclose as _x_numpy_allclose
-from numpy import array as _numpy_array
 from numpy import ascontiguousarray as _numpy_ascontiguousarray
-from numpy import integer as _numpy_integer
 from numpy import isclose as _x_numpy_isclose
 from numpy import ndim as _numpy_ndim
 from numpy import shape as _numpy_shape
-from numpy import sign as _numpy_sign
-from numpy import size as _numpy_size
 from numpy import take as _numpy_take
 from numpy import tile as _numpy_tile
 from numpy import where as _numpy_where
@@ -1880,7 +1877,7 @@ def _numpy_isclose(a, b, rtol=None, atol=None):
 # TODODASK - sort out the "numpy" environment
 
 
-def parse_indices(shape, indices, cyclic=False, numpy_indexing=False):
+def parse_indices(shape, indices, cyclic=False, keepdims=True):
     """TODODASK.
 
     :Parameters:
@@ -1945,16 +1942,6 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False):
         parsed_indices.extend([slice(None)] * (ndim - len_parsed_indices))
 
     if not ndim and parsed_indices:
-        # # If data is scalar then allow it to be indexed with an
-        # # equivalent to [0]
-        # if (len_parsed_indices == 1 and
-        #     parsed_indices[0] in (0,
-        #                           -1,
-        #                           slice(0, 1),
-        #                           slice(-1, None, -1),
-        #                           slice(None, None, None))):
-        #     parsed_indices = []
-        # else:
         raise IndexError(
             "Scalar array can only be indexed with () or Ellipsis"
         )
@@ -2015,7 +2002,7 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False):
                 # -9:0:1  => [1, 2, 3, 4, 5, 6, 7, 8, 9]
                 # -9:1:1  => [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
                 # -10:0:1 => [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
-                if cyclic and not numpy_indexing:
+                if cyclic:
                     index = slice(0, stop - start, step)
                     roll[i] = -start
                 else:
@@ -2029,7 +2016,7 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False):
                 # 6:-4:-1  => [6, 5, 4, 3, 2, 1, 0, 9, 8, 7]
                 # 0:-2:-1  => [0, 9]
                 # 0:-10:-1 => [0, 9, 8, 7, 6, 5, 4, 3, 2, 1]
-                if cyclic and not numpy_indexing:
+                if cyclic:
                     index = slice(start - stop - 1, None, step)
                     roll[i] = -1 - stop
                 else:
@@ -2043,155 +2030,26 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False):
                     or (start > stop and step > 0)
                 ):
                     raise IndexError(
-                        "Invalid indices dimension with size {}: {}".format(
-                            size, index
-                        )
+                        f"Invalid indices dimension with size {size}: {index}"
                     )
 
                 if step < 0 and stop < 0:
                     stop = None
                 index = slice(start, stop, step)
 
-        elif isinstance(index, (int, _numpy_integer)):
+        elif isinstance(index, Integral):
             # --------------------------------------------------------
             # Index is an integer
             # --------------------------------------------------------
             if index < 0:
                 index += size
 
-            if not numpy_indexing:
+            if keepdims:
+                print("CONVERT")
                 index = slice(index, index + 1, 1)
                 is_slice = True
-        else:
-            convert2positve = True
-            if getattr(
-                getattr(index, "dtype", None), "kind", None
-            ) == "b" or isinstance(index[0], bool):
-                # ----------------------------------------------------
-                # Index is a sequence of booleans
-                # ----------------------------------------------------
-                # Convert booleans to non-negative integers. We're
-                # assuming that anything with a dtype attribute also
-                # has a size attribute.
-                if _numpy_size(index) != size:
-                    raise IndexError(
-                        "Incorrect number ({}) of boolean indices for "
-                        "dimension with size {}: {}".format(
-                            _numpy_size(index), size, index
-                        )
-                    )
-
-                index = _numpy_where(index)[0]
-                convert2positve = False
-
-            if not _numpy_ndim(index):
-                if index < 0:
-                    index += size
-
-                index = slice(index, index + 1, 1)
-                is_slice = True
-            else:
-                len_index = len(index)
-                if len_index == 1:
-                    index = index[0]
-                    if index < 0:
-                        index += size
-
-                    index = slice(index, index + 1, 1)
-                    is_slice = True
-                elif len_index:
-                    if convert2positve:
-                        # Convert to non-negative integer numpy array
-                        index = _numpy_array(index)
-                        index = _numpy_where(index < 0, index + size, index)
-
-                    steps = index[1:] - index[:-1]
-                    step = steps[0]
-                    if step and not (steps - step).any():
-                        # Replace the numpy array index with a slice
-                        if step > 0:
-                            start, stop = index[0], index[-1] + 1
-                        elif step < 0:
-                            start, stop = index[0], index[-1] - 1
-
-                        if stop < 0:
-                            stop = None
-
-                        index = slice(start, stop, step)
-                        is_slice = True
-                    else:
-                        if (
-                            (step > 0 and (steps <= 0).any())
-                            or (step < 0 and (steps >= 0).any())
-                            or not step
-                        ):
-                            raise ValueError(
-                                "Bad index (not strictly monotonic): "
-                                "{}".format(index)
-                            )
-
-                #                        if reverse and step < 0:
-                #                            # The array is strictly monotonically
-                #                            # decreasing, so reverse it so that it's
-                #                            # strictly monotonically increasing.  Make
-                #                            # a note that this dimension will need
-                #                            # flipping later
-                #                            index = index[::-1]
-                #                            flip.append(i)
-                #                            step = -step
-                #
-                #                        if envelope:
-                #                            # Create an envelope slice for a parsed
-                #                            # index of a numpy array of integers
-                #                            compressed_indices.append(index)
-                #
-                #                            step = _numpy_sign(step)
-                #                            if step > 0:
-                #                                stop = index[-1] + 1
-                #                            else:
-                #                                stop = index[-1] - 1
-                #                                if stop < 0:
-                #                                    stop = None
-                #
-                #                            index = slice(index[0], stop, step)
-                #                            is_slice = True
-                else:
-                    raise IndexError(
-                        "Invalid indices {} for array with shape {}".format(
-                            parsed_indices, shape
-                        )
-                    )
 
         if is_slice:
-            #            if reverse and index.step < 0:
-            #                # If the slice step is negative, then transform
-            #                # the original slice to a new slice with a
-            #                # positive step such that the result of the new
-            #                # slice is the reverse of the result of the
-            #                # original slice.
-            #                #
-            #                # For example, if the original slice is
-            #                # slice(6,0,-2) then the new slice will be
-            #                # slice(2,7,2):
-            #                #
-            #                # >>> a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
-            #                # >>> a[slice(6, 0, -2)]
-            #                # [6, 4, 2]
-            #                # >>> a[slice(2, 7, 2)]
-            #                # [2, 4, 6]
-            #                # a[slice(6, 0, -2)] == list(reversed(a[slice(2, 7, 2)]))
-            #                # True
-            #                start, stop, step = index.indices(size)
-            #                step *= -1
-            #                div, mod = divmod(start-stop-1, step)
-            #                div_step = div*step
-            #                start -= div_step
-            #                stop = start + div_step + 1
-            #
-            #                index = slice(start, stop, step)
-            #                flip.append(i)
-            #            # --- End: if
-
             # If step is greater than one then make sure that
             # index.stop isn't bigger than it needs to be
             if cyclic and index.step > 1:
@@ -2200,15 +2058,6 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False):
                 stop = start + div * step + 1
                 index = slice(start, stop, step)
 
-            #
-        #            if envelope:
-        #                # Create an envelope slice for a parsed
-        #                # index of a numpy array of integers
-        #                compressed_indices.append(index)
-        #                index = slice(
-        #                    start, stop, (1 if reverse else _numpy_sign(step)))
-        # --- End: if
-
         parsed_indices[i] = index
 
     #    if not (cyclic or reverse or envelope or mask):
@@ -2220,15 +2069,6 @@ def parse_indices(shape, indices, cyclic=False, numpy_indexing=False):
     if cyclic:
         out.append(roll)
 
-    #    if reverse:
-    #        out.append(flip)
-    #
-    #    if envelope:
-    #       out.append(compressed_indices)
-    #
-    #    if mask:
-    #        out.append(mask_indices)
-
     return out
 
 
diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py
index 7f60ad8ab9..35b57f7615 100644
--- a/cf/test/test_Data.py
+++ b/cf/test/test_Data.py
@@ -824,6 +824,102 @@ def test_Data___getitem__(self):
         if self.test_only and inspect.stack()[0][3] not in self.test_only:
             return
 
+        d = cf.Data(np.ma.arange(450).reshape(9, 10, 5))
+
+        for indices in (
+            Ellipsis,
+            (slice(None), slice(None)),
+            (slice(None), Ellipsis),
+            (Ellipsis, slice(None)),
+            (Ellipsis, slice(None), Ellipsis),
+        ):
+            self.assertEqual(d[indices].shape, d.shape)
+
+        for indices in (
+            ([1, 3, 4], slice(None), [2, -1]),
+            (slice(0, 6, 2), slice(None), [2, -1]),
+            (slice(0, 6, 2), slice(None), slice(2, 5, 2)),
+            (slice(0, 6, 2), list(range(10)), slice(2, 5, 2)),
+        ):
+            self.assertEqual(d[indices].shape, (3, 10, 2))
+
+        for indices in (
+            (slice(0, 6, 2), -2, [2, -1]),
+            (slice(0, 6, 2), -2, slice(2, 5, 2)),
+        ):
+            self.assertEqual(d[indices].shape, (3, 1, 2))
+
+        for indices in (
+            ([1, 3, 4], -2, [2, -1]),
+            ([4, 3, 1], -2, [2, -1]),
+            ([1, 4, 3], -2, [2, -1]),
+            ([4, 1, 4], -2, [2, -1]),
+        ):
+            self.assertEqual(d[indices].shape, (3, 1, 2))
+
+        d.__keepdims_indexing__ = False
+        for indices in (
+            ([1, 3, 4], -2, [2, -1]),
+            (slice(0, 6, 2), -2, [2, -1]),
+            (slice(0, 6, 2), -2, slice(2, 5, 2)),
+            ([1, 4, 3], -2, [2, -1]),
+            ([4, 3, 4], -2, [2, -1]),
+            ([1, 4, 4], -2, [2, -1]),
+        ):
+            self.assertEqual(d[indices].shape, (3, 2))
+
+        d.__keepdims_indexing__ = True
+
+        # Cyclic slices
+        d = cf.Data(np.ma.arange(24).reshape(3, 8))
+        d.cyclic(1)
+        self.assertTrue((d[0, :6].array == [[0, 1, 2, 3, 4, 5]]).all())
+        e = d[0, -2:4]
+        self.assertEqual(e.shape, (1, 6))
+        self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3]]).all())
+        self.assertFalse(e.cyclic())
+
+        e = d[0, -2:6]
+        self.assertEqual(e.shape, (1, 8))
+        self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3, 4, 5]]).all())
+        self.assertTrue(e.cyclic(), set([1]))
+
+        with self.assertRaises(IndexError):
+            # Cyclic slice of non-cyclic axis
+            e = d[-1:1]
+
+        d.cyclic(0)
+        e = d[-1:1, -2:-4]
+        self.assertEqual(e.shape, (2, 6))
+        self.assertTrue((e[:, 0].array == [[22], [6]]).all())
+        self.assertTrue((e[0].array == [[22, 23, 16, 17, 18, 19]]).all())
+        self.assertFalse(e.cyclic())
+
+        e = d[-1:2, -2:4]
+        self.assertEqual(e.shape, (3, 6))
+        self.assertEqual(e.cyclic(), set([0]))
+        e = d[-1:1, -2:6]
+        self.assertEqual(e.shape, (2, 8))
+        self.assertEqual(e.cyclic(), set([1]))
+        e = d[-1:2, -2:6]
+        self.assertEqual(e.shape, (3, 8))
+        self.assertEqual(e.cyclic(), set([0, 1]))
+
+        d.cyclic(0, False)
+        print(d.cyclic())
+        d.__keepdims__indexing__ = False
+        self.assertTrue((d[0, :6].array == [0, 1, 2, 3, 4, 5]).all())
+        e = d[0, -2:4]
+        print(e.cyclic())
+        self.assertEqual(e.shape, (6,))
+        self.assertTrue((e.array == [6, 7, 0, 1, 2, 3]).all())
+        self.assertFalse(e.cyclic())
+        d.__keepdims__indexing__ = True
+
+        # Ancillary masks
+        # TODODASK: Test __getitem__ with ancillary masks. Can only do
+        #           this when cf.Data.where has been daskified
+
     def test_Data___setitem__(self):
         if self.test_only and inspect.stack()[0][3] not in self.test_only:
             return

From 900832e6ee09aab65253b54ee63bb529da39bd79 Mon Sep 17 00:00:00 2001
From: David Hassell <david.hassell@ncas.ac.uk>
Date: Mon, 6 Sep 2021 12:29:04 +0100
Subject: [PATCH 02/12] getitem: cyclic axes, unit tests

---
 cf/data/data.py      | 142 ++++++++++++++++++++++++++-----------------
 cf/test/test_Data.py |  39 ++++++++++--
 2 files changed, 119 insertions(+), 62 deletions(-)

diff --git a/cf/data/data.py b/cf/data/data.py
index 6d39ea30c3..92d6873154 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -919,7 +919,8 @@ def __getitem__(self, indices):
           (similar to the way vector subscripts work in Fortran). This is
           the same behaviour as indexing on a `netCDF4.Variable` object.
 
-        . seealso:: `__setitem__`, `_parse_indices`
+        . seealso:: `__setitem__`, `__keepdims_indexing__`,
+                    `__orthogonal_indexing__`
 
         :Returns:
 
@@ -956,6 +957,7 @@ def __getitem__(self, indices):
             if isinstance(arg, str) and arg == "mask":
                 auxiliary_mask = indices[1]
                 indices = indices[2:]
+
         keepdims = self.__keepdims_indexing__
         indices, roll = parse_indices(
             self.shape,
@@ -966,12 +968,13 @@ def __getitem__(self, indices):
 
         axes = self._axes
         cyclic_axes = self._cyclic
-        print ('parsed_indices=', indices)
+        # ------------------------------------------------------------
+        # Roll axes with cyclic slices
+        # ------------------------------------------------------------
         if roll:
-            # Roll axes with cyclic slices. For example, if slice(-2,
-            # 3) has been requested on a cyclic axis, then we roll
-            # that axis by two points and apply the slice(0, 5)
-            # instead.
+            # For example, if slice(-2, 3) has been requested on a
+            # cyclic axis, then we roll that axis by two points and
+            # apply the slice(0, 5) instead.
             if not cyclic_axes.issuperset([axes[i] for i in roll]):
                 raise IndexError(
                     "Can't take a cyclic slice of a non-cyclic axis"
@@ -985,7 +988,9 @@ def __getitem__(self, indices):
             new = self.copy(array=False)
             dx = self._get_dask()
 
-        # Subspace the array
+        # ------------------------------------------------------------
+        # Subspace the dask array
+        # ------------------------------------------------------------
         if self.__orthogonal_indexing__:
             # Apply 'orthogonal indexing': indices that are 1-d arrays
             # or lists subspace along each dimension
@@ -994,37 +999,40 @@ def __getitem__(self, indices):
             axes_with_list_indices = [
                 i
                 for i, x in enumerate(indices)
-                if not isinstance(x, (slice, Integral))
+                if isinstance(x, list) or getattr(x, "shape", False)
             ]
             n_axes_with_list_indices = len(axes_with_list_indices)
 
             if n_axes_with_list_indices < 2:
-                # At most one axis has a list-of-integers index so do
-                # a normal dask subspace
+                # At most one axis has a list/1-d array index so do a
+                # normal dask subspace
                 dx = dx[tuple(indices)]
-                print (indices)
             else:
-                # At least two axes have list-of-integers indices so
-                # we can't do a normal dask subspace
+                # At least two axes have list/1-d array indices so we
+                # can't do a normal dask subspace
 
-                # Subspace axes which have list-of-integer indices
+                # Subspace axes which have list/1-d array indices
                 for axis in axes_with_list_indices:
                     dx = da.take(dx, indices[axis], axis=axis)
 
                 if n_axes_with_list_indices < len(indices):
-                    # Subspace axes which didn't have list-of-integer
-                    # indices. (Do this after subspacing axes which
-                    # have list-of-integer indices in case
+                    # Subspace axes which don't have list/1-d array
+                    # indices. (Do this after subspacing axes which do
+                    # have list/1-d array indices, in case
                     # __keepdims_indexing__ is False.)
                     slice_indices = [
                         slice(None) if i in axes_with_list_indices else x
                         for i, x in enumerate(indices)
                     ]
                     dx = dx[tuple(slice_indices)]
-
         else:
             raise ValueError("Orthogonal indexing is currently required.")
 
+        # ------------------------------------------------------------
+        # Set the subspaced dask array
+        # ------------------------------------------------------------
+        new._set_dask(dx)
+
         # ------------------------------------------------------------
         # Note which axes have not been dropped from the result
         # (i.e. those indexed with non-integral indices)
@@ -1033,10 +1041,11 @@ def __getitem__(self, indices):
             axes_with_non_integral_indices = axes
             shape = self.shape
         else:
+            # Dropped axes can no longer be cyclic
             axes_with_non_integral_indices = [
                 axis
                 for axis, x in zip(axes, indices)
-                if not isinstance(x, Integral)
+                if not (isinstance(x, Integral) or getattr(x, "shape", True))
             ]
             if len(axes_with_non_integral_indices) < len(indices):
                 # Never change the _axes attribute in-place
@@ -1044,38 +1053,41 @@ def __getitem__(self, indices):
 
                 if cyclic_axes:
                     cyclic_axes = set(
-                        [axis
-                         for axis in cyclic_axes
-                         if axis in axes_with_non_integral_indices]
+                        [
+                            axis
+                            for axis in cyclic_axes
+                            if axis in axes_with_non_integral_indices
+                        ]
                     )
-                    shape = [n
-                             for n, axis in zip(self.shape, axes)
-                             if axis in axes_with_non_integral_indices]
-
-        # Set the subspaced dask array
-        new._set_dask(dx)
-
-        # ------------------------------------------------------------
-        # Apply auxiliary masks
-        # ------------------------------------------------------------
-        for mask in auxiliary_mask:
-            new.where(mask, cf_masked, inplace=True)
+                    if cyclic_axes:
+                        shape = [
+                            n
+                            for n, axis in zip(self.shape, axes)
+                            if axis in axes_with_non_integral_indices
+                        ]
 
         # ------------------------------------------------------------
-        # Cyclic axes
+        # Cyclic axes that have been reduced in size are no longer
+        # considered to be cyclic
         # ------------------------------------------------------------
         if cyclic_axes:
-            # Cyclic axes that have been reduced in size are no longer
-            # considered to be cyclic
-            x = [axis
-                 for axis, n0, n1 in zip(axes_with_non_integral_indices,
-                                         shape,
-                                         new.shape)
-                 if n1 != n0 and axis in cyclic_axes]
+            x = [
+                axis
+                for axis, n0, n1 in zip(
+                    axes_with_non_integral_indices, shape, new.shape
+                )
+                if n1 != n0 and axis in cyclic_axes
+            ]
             if x:
                 # Never change the _cyclic attribute in-place
                 new._cyclic = cyclic_axes.difference(x)
 
+        # ------------------------------------------------------------
+        # Apply auxiliary masks
+        # ------------------------------------------------------------
+        for mask in auxiliary_mask:
+            new.where(mask, cf_masked, None, inplace=True)
+
         return new
 
     @daskified(1)
@@ -1187,14 +1199,23 @@ def __orthogonal_indexing__(self):
         """Flag to indicate that orthogonal indexing is supported.
 
         Always True, indicating that 'orthogonal indexing' is
-        applied. This means that indices that are 1-d arrays or lists
-        subspace along each dimension independently. This behaviour is
-        similar to Fortran, but different to `numpy`.
+        applied. This means that when indices are 1-d arrays or lists
+        then they subspace along each dimension independently. This
+        behaviour is similar to Fortran, but different to `numpy`.
 
         .. versionadded:: TODODASK
 
-        .. seealso:: __keepdims_indexing__,
-                     netCDF4.Variable.__orthogonal_indexing__
+        .. seealso:: `__keepdims_indexing__`, `__getitem__`,
+                     `__setitem__`,
+                     `netCDF4.Variable.__orthogonal_indexing__`
+
+        **Examples:**
+
+        >>> d = cf.Data([[1, 2, 3], [4, 5, 6]])
+        >>> d[[0], [0, 2]].shape
+        (1, 2)
+        >>> d[[0, 1], [0, 2]].shape
+        (2, 2)
 
         """
         return True
@@ -1204,27 +1225,36 @@ def __orthogonal_indexing__(self):
     def __keepdims_indexing__(self):
         """Flag to indicate whether orthogonal indexing is supported.
 
-        If True then providing a single integer as a single-axis index
-        does *not* reduce the number of array dimensions by 1. This
-        behaviour is different to `numpy`.
+        If set to True (the default) then providing a single integer
+        as a single-axis index does *not* reduce the number of array
+        dimensions by 1. This behaviour is different to `numpy`.
 
-        If False then providing a single integer as a single-axis
-        index reduces the number of array dimensions by 1. This
-        behaviour is the same as `numpy`.
+        If set to False then providing a single integer as a
+        single-axis index reduces the number of array dimensions by
+        1. This behaviour is the same as `numpy`.
 
         .. versionadded:: TODODASK
 
-        .. seealso:: __orthogonal_indexing__
+        .. seealso:: `__orthogonal_indexing__`, `__getitem__`,
+                     `__setitem__`
 
         **Examples:**
 
-        >>> d = cf.Data([[1, 2], [3, 4]])
+        >>> d = cf.Data([[1, 2, 3], [4, 5, 6]])
         >>> d.__keepdims_indexing__ = True
         >>> d[0].shape
-        (1, 2)
+        (1, 3)
+        >>> d[:, 1].shape
+        (2, 1)
+        >>> d[0, 1].shape
+        (1, 1)
         >>> d.__keepdims_indexing__ = False
         >>> d[0].shape
+        (3,)
+        >>> d[:, 1].shape
         (2,)
+        >>> d[0, 1].shape
+        ()
 
         """
         return self._custom.get("__keepdims_indexing__", True)
diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py
index 35b57f7615..c2da15cb7a 100644
--- a/cf/test/test_Data.py
+++ b/cf/test/test_Data.py
@@ -820,7 +820,7 @@ def test_Data_squeeze_insert_dimension(self):
         a = f.array
         self.assertTrue(np.allclose(a, array))
 
-    def test_Data___getitem__(self):
+    def test_Data__getitem__(self):
         if self.test_only and inspect.stack()[0][3] not in self.test_only:
             return
 
@@ -858,6 +858,7 @@ def test_Data___getitem__(self):
             self.assertEqual(d[indices].shape, (3, 1, 2))
 
         d.__keepdims_indexing__ = False
+        self.assertFalse(d.__keepdims_indexing__)
         for indices in (
             ([1, 3, 4], -2, [2, -1]),
             (slice(0, 6, 2), -2, [2, -1]),
@@ -868,13 +869,20 @@ def test_Data___getitem__(self):
         ):
             self.assertEqual(d[indices].shape, (3, 2))
 
+        self.assertFalse(d.__keepdims_indexing__)
         d.__keepdims_indexing__ = True
+        self.assertTrue(d.__keepdims_indexing__)
+
+        d = cf.Data(np.ma.arange(24).reshape(3, 8))
+        e = d[0, 2:4]
+        print(e.cyclic(), e.shape)
 
         # Cyclic slices
         d = cf.Data(np.ma.arange(24).reshape(3, 8))
         d.cyclic(1)
         self.assertTrue((d[0, :6].array == [[0, 1, 2, 3, 4, 5]]).all())
         e = d[0, -2:4]
+        print(e.cyclic(), e.shape)
         self.assertEqual(e.shape, (1, 6))
         self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3]]).all())
         self.assertFalse(e.cyclic())
@@ -906,17 +914,36 @@ def test_Data___getitem__(self):
         self.assertEqual(e.cyclic(), set([0, 1]))
 
         d.cyclic(0, False)
-        print(d.cyclic())
-        d.__keepdims__indexing__ = False
-        self.assertTrue((d[0, :6].array == [0, 1, 2, 3, 4, 5]).all())
+        d.__keepdims_indexing__ = False
+        e = d[0, :6]
+        self.assertFalse(e.__keepdims_indexing__)
+        self.assertEqual(e.shape, (6,))
+        self.assertTrue((e.array == [0, 1, 2, 3, 4, 5]).all())
+        print("----")
         e = d[0, -2:4]
-        print(e.cyclic())
         self.assertEqual(e.shape, (6,))
         self.assertTrue((e.array == [6, 7, 0, 1, 2, 3]).all())
         self.assertFalse(e.cyclic())
-        d.__keepdims__indexing__ = True
+        d.__keepdims_indexing__ = True
+
+        # Keepdims indexing
+        d = cf.Data([[1, 2, 3], [4, 5, 6]])
+        self.assertEqual(d[0].shape, (1, 3))
+        self.assertEqual(d[:, 1].shape, (2, 1))
+        self.assertEqual(d[0, 1].shape, (1, 1))
+        d.__keepdims_indexing__ = False
+        self.assertEqual(d[0].shape, (3,))
+        self.assertEqual(d[:, 1].shape, (2,))
+        self.assertEqual(d[0, 1].shape, ())
+        d.__keepdims_indexing__ = True
+
+        # Orthogonal indexing
+        self.assertEqual(d[[0], [0, 2]].shape, (1, 2))
+        self.assertEqual(d[[0, 1], [0, 2]].shape, (2, 2))
+        self.assertEqual(d[[0, 1], [2]].shape, (2, 1))
 
         # Ancillary masks
+        #
         # TODODASK: Test __getitem__ with ancillary masks. Can only do
         #           this when cf.Data.where has been daskified
 

From 336bd687385ef4a4698f87d817387ce850c02b9a Mon Sep 17 00:00:00 2001
From: David Hassell <david.hassell@ncas.ac.uk>
Date: Mon, 6 Sep 2021 12:32:20 +0100
Subject: [PATCH 03/12] remove debugging print statements

---
 cf/functions.py      | 1 -
 cf/test/test_Data.py | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/cf/functions.py b/cf/functions.py
index d9c46e1872..a6f16a2cb1 100644
--- a/cf/functions.py
+++ b/cf/functions.py
@@ -2045,7 +2045,6 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True):
                 index += size
 
             if keepdims:
-                print("CONVERT")
                 index = slice(index, index + 1, 1)
                 is_slice = True
 
diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py
index c2da15cb7a..eb7a052b5c 100644
--- a/cf/test/test_Data.py
+++ b/cf/test/test_Data.py
@@ -875,14 +875,12 @@ def test_Data__getitem__(self):
 
         d = cf.Data(np.ma.arange(24).reshape(3, 8))
         e = d[0, 2:4]
-        print(e.cyclic(), e.shape)
 
         # Cyclic slices
         d = cf.Data(np.ma.arange(24).reshape(3, 8))
         d.cyclic(1)
         self.assertTrue((d[0, :6].array == [[0, 1, 2, 3, 4, 5]]).all())
         e = d[0, -2:4]
-        print(e.cyclic(), e.shape)
         self.assertEqual(e.shape, (1, 6))
         self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3]]).all())
         self.assertFalse(e.cyclic())
@@ -919,7 +917,6 @@ def test_Data__getitem__(self):
         self.assertFalse(e.__keepdims_indexing__)
         self.assertEqual(e.shape, (6,))
         self.assertTrue((e.array == [0, 1, 2, 3, 4, 5]).all())
-        print("----")
         e = d[0, -2:4]
         self.assertEqual(e.shape, (6,))
         self.assertTrue((e.array == [6, 7, 0, 1, 2, 3]).all())

From 1bfe9fe127199cd838581d84b871005be9845aed Mon Sep 17 00:00:00 2001
From: David Hassell <david.hassell@ncas.ac.uk>
Date: Mon, 6 Sep 2021 17:26:29 +0100
Subject: [PATCH 04/12] setitem

---
 cf/data/data.py      | 111 ++++++++++++++++++++++---------------------
 cf/data/utils.py     |  75 +++++++++++++++++++++++++++--
 cf/functions.py      |  32 +++++--------
 cf/test/test_Data.py |  36 ++++++++++----
 4 files changed, 166 insertions(+), 88 deletions(-)

diff --git a/cf/data/data.py b/cf/data/data.py
index 92d6873154..4a3f8a74cf 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -105,6 +105,7 @@
 from .partition import Partition
 from .partitionmatrix import PartitionMatrix
 from .utils import (  # is_small,; is_very_small,
+    conform_units,
     convert_to_datetime,
     convert_to_reftime,
     dask_compatible,
@@ -968,6 +969,7 @@ def __getitem__(self, indices):
 
         axes = self._axes
         cyclic_axes = self._cyclic
+
         # ------------------------------------------------------------
         # Roll axes with cyclic slices
         # ------------------------------------------------------------
@@ -1102,6 +1104,12 @@ def __setitem__(self, indices, value):
         a subspace. See `__getitem__` for details on how to define
         subspace of the data array.
 
+        .. note:: Currently at most one dimension's assignment index
+                  may be a 1-d array of integers or booleans. This is
+                  is different to `__getitem__`, which applies
+                  'orthogonal indexing' when multiple indices of 1-d
+                  array of integers or booleans are present.
+
         **Missing data**
 
         The treatment of missing data elements during assignment to a
@@ -1116,77 +1124,69 @@ def __setitem__(self, indices, value):
         the `cf.masked` constant or by assignment to a value which
         contains masked elements.
 
-        .. seealso:: `cf.masked`, `hardmask`, `where`
+        .. seealso:: `__getitem__`, `cf.masked`, `hardmask`, `where`
 
         **Examples:**
 
         """
-        # TODODASK - sort out the "numpy" environment
-
         indices, roll = parse_indices(
             self.shape, indices, cyclic=True, keepdims=True
         )
         indices = tuple(indices)
 
         axes_with_list_indices = [
-            True for x in indices if not isinstance(x, slice)
+            i
+            for i, x in enumerate(indices)
+            if isinstance(x, list) or getattr(x, "shape", False)
         ]
         if len(axes_with_list_indices) > 1:
-            raise IndexError(
+            raise NotImplementedError(
                 "Currently limited to at most one dimension's assignment "
                 "index being a 1-d array of integers or booleans. "
                 f"Got: {indices}"
             )
+            # TODODASK: The inherited algorithm that does assignment
+            #           for multiple list/1-d array indices
+            #           (cfdm.Data._set_subspace) won't work when the
+            #           1-d array is a dask array because the array
+            #           may need to be computed at __setitem__
+            #           runtime, which is not desirable. Until this
+            #           can be fixed, it's easiest to disallow this
+            #           case, that was allowed pre-dask.
 
+        # ------------------------------------------------------------
+        # Roll axes with cyclic slices
+        # ------------------------------------------------------------
         if roll:
-            # Roll axes with cyclic slices. For example, if assigning
-            # to slice(-2, 3) has been requested on a cyclic axis (and
-            # we're not using numpy indexing), then we roll that axis
-            # by two points and assign to slice(0, 5) instead. The
-            # axis is then unrolled by two points afer the assignment
-            # has been made.
+            # For example, if assigning to slice(-2, 3) has been
+            # requested on a cyclic axis (and we're not using numpy
+            # indexing), then we roll that axis by two points and
+            # assign to slice(0, 5) instead. The axis is then unrolled
+            # by two points afer the assignment has been made.
             axes = self._axes
-            if self._cyclic.intersection([axes[i] for i in roll]):
+            if not self._cyclic.issuperset([axes[i] for i in roll]):
                 raise IndexError(
                     "Can't do a cyclic assignment to a non-cyclic axis"
                 )
 
             roll_axes = tuple(roll.keys())
             shifts = tuple(roll.values())
-            self.roll(axis=roll_axes, shift=shifts, inplace=True)
-
-        # TODODASK: multiple lists
+            self.roll(shift=shifts, axis=roll_axes, inplace=True)
 
         # Make sure that the units of value are the same as self
-        try:
-            value_units = value.Units
-        except AttributeError:
-            pass
-        else:
-            self_units = self.Units
-            if value_units.equivalent(self_units):
-                if value_units != self_units:
-                    value = value.copy()
-                    value.Units = self.Units
-            elif value_units and self_units:
-                raise ValueError(
-                    f"Can't assign values with units {value_units!r} "
-                    f"to data with units {self_units!r}"
-                )
-
-        # Set the mask hardness, in case any previous operations have
-        # inadvertently changed it.
-        self._set_mask_hardness()
+        value = conform_units(value, self.Units)
 
         # Do the assignment
         dx = self._get_dask()
         dx[indices] = dask_compatible(value)
 
+        # ------------------------------------------------------------
+        # Unroll any axes that were rolled to enable a cyclic
+        # assignment
+        # ------------------------------------------------------------
         if roll:
-            # Unroll any axes that were rolled to enable a cyclic
-            # assignment
             shifts = [-shift for shift in shifts]
-            self.roll(axis=roll_axes, shift=shifts, inplace=True)
+            self.roll(shift=shifts, axis=roll_axes, inplace=True)
 
         return
 
@@ -5847,8 +5847,6 @@ def hardmask(self):
         changes the `hardmask` attribute to `False`. The mask can be
         re-hardened with `harden_mask` method.
 
-        The following operations
-
         .. seealso:: `harden_mask`, `soften_mask`, `where`
 
         **Examples:**
@@ -5897,7 +5895,7 @@ def is_masked(self):
 
         **Performance**
 
-        `is_masked` causes all delayed operations to be computed.
+        `is_masked` causes all delayed operations to be executed.
 
         **Examples:**
 
@@ -5909,6 +5907,13 @@ def is_masked(self):
         True
 
         """
+        # TODODASK: (General point) When we run something that
+        #           executes all of the lazy operations (like
+        #           is_masked), should/could we replace the dask array
+        #           of self with a "persisted" version of the computed
+        #           data? If we did this, we would want to have the
+        #           ability to cache persisted chunks to disk, as they
+        #           came into being on each thread.
 
         def is_masked(a):
             out = np.ma.is_masked(a)
@@ -5916,7 +5921,7 @@ def is_masked(a):
 
         dx = self._get_dask()
 
-        out_ind = tuple(dx.ndim)
+        out_ind = tuple(range(dx.ndim))
         dx_ind = out_ind
 
         dx = da.blockwise(
@@ -6111,7 +6116,6 @@ def array(self):
                 a.harden_mask()
             else:
                 a.soften_mask()
-        # --- End: if
 
         return a
 
@@ -12660,6 +12664,14 @@ def roll(self, axis, shift, inplace=False, i=False):
                 *Parameter example:*
                   Convolve the last axis: ``axis=-1``.
 
+            shift: `int`, or `tuple` of `int`
+                The number of places by which elements are shifted.
+                If a `tuple`, then *axis* must be a tuple of the same
+                size, and each of the given axes is shifted by the
+                corresponding number. If an `int` while *axis* is a
+                tuple of `int`, then the same value is used for all
+                given axes.
+
             {{inplace: `bool`, optional}}
 
             {{i: deprecated at version 3.0.0}}
@@ -12669,21 +12681,12 @@ def roll(self, axis, shift, inplace=False, i=False):
             `Data` or `None`
 
         """
-        d = _inplace_enabled_define_and_cleanup(self)
-
-        if isinstance(shift, int):
-            shift = (shift,)
+        # TODODASK - consider matching the numpy/dask api: "shift, axis="
 
-        axes = self._parse_axes(axis)
-        if len(axes) != len(shift):
-            raise ValueError("TODODASK")
-
-        if not shift:
-            # Null roll
-            return d
+        d = _inplace_enabled_define_and_cleanup(self)
 
         dx = d._get_dask()
-        dx = da.roll(dx, axis=axes, shift=shift)
+        dx = da.roll(dx, shift, axis=axis)
         d._set_dask(dx)
 
         return d
diff --git a/cf/data/utils.py b/cf/data/utils.py
index d7a150eba1..ca1584ba85 100644
--- a/cf/data/utils.py
+++ b/cf/data/utils.py
@@ -1,5 +1,5 @@
 """General functions useful for `Data` functionality."""
-from functools import partial
+from functools import lru_cache, partial
 from itertools import product
 
 import numpy as np
@@ -93,7 +93,6 @@ def convert_to_reftime(array, units, first_value=None):
                     )
             else:
                 d_calendar = x_calendar
-        # --- End: if
 
         if not units:
             # Set the units to something that is (hopefully)
@@ -210,12 +209,13 @@ def _get_calendar(x):
     return set(cals.tolist())
 
 
+@lru_cache(maxsize=32)
 def new_axis_identifier(existing_axes=(), basename="dim"):
-    """Return a new, unique axis identifiers.
+    """Return a new, unique axis identifier.
 
     The name is arbitrary and has no semantic meaning.
 
-    .. versionadded:: 4.0.0
+    .. versionadded:: TODODASK
 
     :Parameters:
 
@@ -365,6 +365,71 @@ def dask_compatible(a):
 
     """
     try:
-        return a.data._get_data()
+        return a.data._get_dask()
     except AttributeError:
         return a
+
+
+def conform_units(value, units):
+    """Conform units.
+
+    If *value* has units defined by its `Units` attribute then
+
+    * if the value units are equal to *units* then *value* is returned
+      unchanged;
+
+    * if the value units are equivalent to *units* then a copy of
+      *value* with the same units as data* is returned;
+
+    * if the value units are not equivalent to *units* then an
+      exception is raised.
+
+    In all other cases *value* is returned unchanged.
+
+    .. versionadded:: TODODASK
+
+    :Parameters:
+
+        value:
+            The value whose units are to be conformed to *units*.
+
+        units: `Units`
+            The units to conform to.
+
+    **Examples:**
+
+    >>> conform_units(1, cf.Units('metres'))
+    1
+    >>> conform_units([1, 2, 3], cf.Units('metres'))
+    [1, 2, 3]
+    >>> import numpy
+    >>> conform_units(numpy.array([1, 2, 3]), cf.Units('metres'))
+    array([1, 2, 3])
+    >>> conform_units('string', cf.Units('metres'))
+    'string'
+    >>> d = cf.Data([1, 2] , 'm')
+    >>> conform_units(d, cf.Units('metres'))
+    <CF Data(2): [1, 2] m>
+    >>> d = cf.Data([1, 2] , 'km')
+    >>> conform_units(d, cf.Units('metres'))
+    <CF Data(2): [1000.0, 2000.0] metres>
+    >>> conform_units(d, cf.Units('s'))
+        ...
+    ValueError: Units <Units: km> are incompatible with units <Units: s>
+
+    """
+    try:
+        value_units = value.Units
+    except AttributeError:
+        pass
+    else:
+        if value_units.equivalent(units):
+            if value_units != units:
+                value = value.copy()
+                value.Units = units
+        elif value_units and units:
+            raise ValueError(
+                f"Units {value_units!r} are incompatible with units {units!r}"
+            )
+
+    return value
diff --git a/cf/functions.py b/cf/functions.py
index a6f16a2cb1..c8e48540e0 100644
--- a/cf/functions.py
+++ b/cf/functions.py
@@ -1886,33 +1886,31 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True):
 
         indices: `tuple` (not a `list`!)
 
+        keepdims: `bool`, optional
+            If True then an integral index is converted to a
+            slice. For instance, ``3`` would become ``slice(3, 4)``.
+
     :Returns:
 
         `list` [, `dict`]
+            The parsed indices. If *cyclic* is True the a dictionary
+            is also returned that contains the parameters needed to
+            interpret any cyclic slices.
 
     **Examples:**
 
     >>> cf.parse_indices((5, 8), ([1, 2, 4, 6],))
     [array([1, 2, 4, 6]), slice(0, 8, 1)]
-    >>> cf.parse_indices((5, 8), ([2, 4, 6],))
-    [slice(2, 7, 2), slice(0, 8, 1)]
+    >>> cf.parse_indices((5, 8), (Ellipsis, [2, 4, 6]))
+    [slice(0, 5, 1), slice(2, 7, 2)]
 
     """
     parsed_indices = []
     roll = {}
-    flip = []
-    compressed_indices = []
-    mask_indices = []
 
     if not isinstance(indices, tuple):
         indices = (indices,)
 
-    #    if mask and indices:
-    #        arg0 = indices[0]
-    #        if isinstance(arg0, str) and arg0 == 'mask':
-    #            mask_indices = indices[1]
-    #            indices = indices[2:]
-
     # Initialize the list of parsed indices as the input indices with any
     # Ellipsis objects expanded
     length = len(indices)
@@ -1933,9 +1931,7 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True):
 
     if ndim and len_parsed_indices > ndim:
         raise IndexError(
-            "Invalid indices {} for array with shape {}".format(
-                parsed_indices, shape
-            )
+            f"Invalid indices {parsed_indices} for array with shape {shape}"
         )
 
     if len_parsed_indices < ndim:
@@ -2059,16 +2055,10 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True):
 
         parsed_indices[i] = index
 
-    #    if not (cyclic or reverse or envelope or mask):
     if not cyclic:
         return parsed_indices
 
-    out = [parsed_indices]
-
-    if cyclic:
-        out.append(roll)
-
-    return out
+    return parsed_indices, roll
 
 
 def get_subspace(array, indices):
diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py
index eb7a052b5c..b0c8afac6a 100644
--- a/cf/test/test_Data.py
+++ b/cf/test/test_Data.py
@@ -955,11 +955,10 @@ def test_Data___setitem__(self):
             else:
                 a.soften_mask()
 
-            d = cf.Data(a, "m", chunks=(2, 2), hardmask=hardmask)
+            d = cf.Data(a.copy(), "metres", hardmask=hardmask)
 
-            # Scalar assignment
-            a[:, 1] = np.ma.masked
-            d[:, 1] = np.ma.masked
+            a[:, 1] = cf.masked
+            d[:, 1] = cf.masked
 
             a[0, 2] = -6
             d[0, 2] = -6
@@ -982,16 +981,37 @@ def test_Data___setitem__(self):
             a[8, [8, 6, 5]] = -5
             d[8, [8, 6, 5]] = -5
 
-            self.assertTrue((d.array == a).all())
-            self.assertTrue((d.array.mask == a.mask).all())
+            a[...] = -a
+            d[...] = -d
 
-            # Non-scalar assignment (TODODASK - add more tests)
             a[0] = a[2]
-            d[0] = a[2]
+            d[0] = d[2]
 
             self.assertTrue((d.array == a).all())
             self.assertTrue((d.array.mask == a.mask).all())
 
+        # Units
+        a = np.ma.arange(90).reshape(9, 10)
+        d = cf.Data(a, "metres")
+        d[...] = cf.Data(a * 100, "cm")
+        self.assertTrue((d.array == a).all())
+        self.assertTrue((d.array.mask == a.mask).all())
+
+        # Cyclic
+        d.cyclic(1)
+        self.assertTrue((d[0].array == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).all())
+        d[0, -1:1] = [-99, -1]
+        self.assertTrue(
+            (d[0].array == [-1, 1, 2, 3, 4, 5, 6, 7, 8, -99]).all()
+        )
+        self.assertEqual(d.cyclic(), set([1]))
+
+        with self.assertRaises(NotImplementedError):
+            d[[1, 2], [0, 4, 1]] = 9
+
+        with self.assertRaises(NotImplementedError):
+            d[[1], [0, 4, 1]] = 9
+
     @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'")
     def test_Data_outerproduct(self):
         if self.test_only and inspect.stack()[0][3] not in self.test_only:

From 8f07e1fbdeab7b3434b1ef2c2994cba52fbbec3b Mon Sep 17 00:00:00 2001
From: David Hassell <david.hassell@ncas.ac.uk>
Date: Wed, 8 Sep 2021 18:26:48 +0100
Subject: [PATCH 05/12] hardmask, unknown shape

---
 cf/data/data.py      | 392 ++++++++++++++++++++++++++-----------------
 cf/field.py          |  10 +-
 cf/functions.py      |  60 ++-----
 cf/test/test_Data.py |  26 ++-
 4 files changed, 274 insertions(+), 214 deletions(-)

diff --git a/cf/data/data.py b/cf/data/data.py
index 4a3f8a74cf..a18cdb243e 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -1,4 +1,5 @@
 import logging
+import math
 import operator
 from functools import partial, reduce, wraps
 from itertools import product
@@ -113,6 +114,11 @@
     new_axis_identifier,
 )
 
+# from .chunk_utils import (  # is_small,; is_very_small,
+#    harden_mask_chunk,
+#   soften_mask_chunk,
+# )
+
 # from dask.array import Array
 
 
@@ -300,8 +306,7 @@ def __init__(
 
             array: optional
                 The array of values. May be any scalar or array-like
-                object, including another `Data` instance. Ignored if the
-                *source* parameter is set.
+                object, including another `Data` instance.
 
                 *Parameter example:*
                   ``array=[34.6]``
@@ -314,8 +319,7 @@ def __init__(
 
             units: `str` or `Units`, optional
                 The physical units of the data. if a `Units` object is
-                provided then this an also set the calendar. Ignored if
-                the *source* parameter is set.
+                provided then this an also set the calendar.
 
                 The units (without the calendar) may also be set after
                 initialisation with the `set_units` method.
@@ -327,8 +331,7 @@ def __init__(
                   ``units='days since 2018-12-01'``
 
             calendar: `str`, optional
-                The calendar for reference time units. Ignored if the
-                *source* parameter is set.
+                The calendar for reference time units.
 
                 The calendar may also be set after initialisation with the
                 `set_calendar` method.
@@ -340,8 +343,7 @@ def __init__(
                 The fill value of the data. By default, or if set to
                 `None`, the `numpy` fill value appropriate to the array's
                 data-type will be used (see
-                `numpy.ma.default_fill_value`). Ignored if the *source*
-                parameter is set.
+                `numpy.ma.default_fill_value`).
 
                 The fill value may also be set after initialisation with
                 the `set_fill_value` method.
@@ -351,7 +353,8 @@ def __init__(
 
             dtype: data-type, optional
                 The desired data-type for the data. By default the
-                data-type will be inferred form the *array* parameter.
+                data-type will be inferred form the *array*
+                parameter.
 
                 The data-type may also be set after initialisation with
                 the `dtype` attribute.
@@ -369,11 +372,12 @@ def __init__(
 
             mask: optional
                 Apply this mask to the data given by the *array*
-                parameter. By default, or if *mask* is `None`, no mask is
-                applied. May be any scalar or array-like object (such as a
-                `list`, `numpy` array or `Data` instance) that is
-                broadcastable to the shape of *array*. Masking will be
-                carried out where the mask elements evaluate to `True`.
+                parameter. By default, or if *mask* is `None`, no mask
+                is applied. May be any scalar or array-like object
+                (such as a `list`, `numpy` array or `Data` instance)
+                that is broadcastable to the shape of *array*. Masking
+                will be carried out where the mask elements evaluate
+                to `True`.
 
                 This mask will applied in addition to any mask already
                 defined by the *array* parameter.
@@ -381,8 +385,10 @@ def __init__(
                 .. versionadded:: 3.0.5
 
             source: optional
-                Initialize the array, units, calendar and fill value from
-                those of *source*.
+                Initialize the data values and metadata (such as
+                units, mask hardness, etc.) from the data of
+                *source*. All other arguments, with the exception of
+                *copy*, are ignored.
 
             hardmask: `bool`, optional
                 If False then the mask is soft. By default the mask is
@@ -460,7 +466,6 @@ def __init__(
                     "Can't set the 'source' and 'loads' parameters "
                     "at the same time"
                 )
-        # --- End: if
 
         if source is not None:
             super().__init__(source=source, _use_array=_use_array)
@@ -474,6 +479,11 @@ def __init__(
             else:
                 self._del_dask(None)
 
+            # Note the mask hardness. It is safe to assume that if a
+            # dask array has been set, then it's mask hardness will be
+            # already baked into each chunk.
+            self._hardmask = getattr(source, "hardmask", _DEFAULT_HARDMASK)
+
             return
 
         super().__init__(
@@ -483,7 +493,8 @@ def __init__(
         # Create the _HDF_chunks attribute: defines HDF chunking when
         # writing to disk.
         #
-        # Never change the _HDF_chunks attribute  in-place.
+        # Never change the value of the _HDF_chunks attribute
+        # in-place.
         self._HDF_chunks = None
 
         if loadd is not None:
@@ -498,7 +509,12 @@ def __init__(
         units = Units(units, calendar=calendar)
         self._Units = units
 
-        # Set the mask hardness
+        # Note the mask hardness. This only records what we want the
+        # mask hardness to be, and is required in case this
+        # initialization does not set an array (i.e. array is None or
+        # _use_array is False). If a dask array is actually set later
+        # on, then the mask hardness will be set properly, i.e. it
+        # will be baked into each chunk.
         self._hardmask = hardmask
 
         if array is None:
@@ -515,7 +531,7 @@ def __init__(
         # is removed from _axes then it must also be removed from
         # _cyclic.
         #
-        # Never change the _cyclic attribute in-place.
+        # Never change the value of the _cyclic attribute in-place.
         self._cyclic = _empty_set
 
         # Create the _axes attribute: an ordered sequence of unique
@@ -560,7 +576,6 @@ def __init__(
             first_value = first_non_missing_value(array)
             if first_value is not None:
                 dt = hasattr(first_value, "timetuple")
-        # --- End: if
 
         # Convert string or object date-times to floating point
         # reference times
@@ -572,6 +587,9 @@ def __init__(
         # Store the dask array
         self._set_dask(array, delete_source=False)
 
+        # Set the mask hardness on each chunk.
+        self.hardmask = hardmask
+
         # Override the data type
         if dtype is not None:
             self.dtype = dtype
@@ -579,10 +597,6 @@ def __init__(
         # Apply a mask
         if mask is not None:
             self.where(mask, cf_masked, inplace=True)
-        else:
-            # Set the mask hardness (which is otherwise set by the
-            # 'where' method)
-            self._set_mask_hardness()
 
     @property
     def dask_array(self):
@@ -622,17 +636,17 @@ def __contains__(self, value):
 
         **Performance**
 
-        All delayed operations are computed, and the operation does
-        not short-circuit once the first occurrence is found.
+        All delayed operations are exectued, and there is no
+        short-circuit once the first occurrence is found.
 
         **Examples:**
 
-        >>> d = Data([[0.0, 1,  2], [3, 4, 5]], 'm')
+        >>> d = cf.Data([[0.0, 1,  2], [3, 4, 5]], 'm')
         >>> 4 in d
         True
-        >>> Data(3) in d
+        >>> cf.Data(3) in d
         True
-        >>> Data([2.5], units='2 m') in d
+        >>> cf.Data([2.5], units='2 m') in d
         True
         >>> [[2]] in d
         True
@@ -661,7 +675,7 @@ def contains_chunk(a, value):
 
         dx = self._get_dask()
 
-        out_ind = tuple(dx.ndim)
+        out_ind = tuple(range(dx.ndim))
         dx_ind = out_ind
 
         dx = da.blockwise(
@@ -920,6 +934,11 @@ def __getitem__(self, indices):
           (similar to the way vector subscripts work in Fortran). This is
           the same behaviour as indexing on a `netCDF4.Variable` object.
 
+        **Performance**
+
+        If the shape of the data is unknown then it is calculated
+        immediately by exectuting all delayed operations.
+
         . seealso:: `__setitem__`, `__keepdims_indexing__`,
                     `__orthogonal_indexing__`
 
@@ -959,9 +978,11 @@ def __getitem__(self, indices):
                 auxiliary_mask = indices[1]
                 indices = indices[2:]
 
+        shape = self.shape
         keepdims = self.__keepdims_indexing__
+
         indices, roll = parse_indices(
-            self.shape,
+            shape,
             indices,
             cyclic=True,
             keepdims=keepdims,
@@ -1036,37 +1057,24 @@ def __getitem__(self, indices):
         new._set_dask(dx)
 
         # ------------------------------------------------------------
-        # Note which axes have not been dropped from the result
-        # (i.e. those indexed with non-integral indices)
+        # Set the axis identifiers for the subspace
         # ------------------------------------------------------------
+        shape0 = shape
         if keepdims:
-            axes_with_non_integral_indices = axes
-            shape = self.shape
+            new_axes = axes
         else:
-            # Dropped axes can no longer be cyclic
-            axes_with_non_integral_indices = [
+            new_axes = [
                 axis
                 for axis, x in zip(axes, indices)
-                if not (isinstance(x, Integral) or getattr(x, "shape", True))
+                if not isinstance(x, Integral) and getattr(x, "shape", True)
             ]
-            if len(axes_with_non_integral_indices) < len(indices):
-                # Never change the _axes attribute in-place
-                new._axes = axes_with_non_integral_indices
-
+            if new_axes != axes:
+                new._axes = new_axes
+                cyclic_axes = new._cyclic
                 if cyclic_axes:
-                    cyclic_axes = set(
-                        [
-                            axis
-                            for axis in cyclic_axes
-                            if axis in axes_with_non_integral_indices
-                        ]
-                    )
-                    if cyclic_axes:
-                        shape = [
-                            n
-                            for n, axis in zip(self.shape, axes)
-                            if axis in axes_with_non_integral_indices
-                        ]
+                    shape0 = [
+                        n for n, axis in zip(shape, axes) if axis in new_axes
+                    ]
 
         # ------------------------------------------------------------
         # Cyclic axes that have been reduced in size are no longer
@@ -1075,13 +1083,12 @@ def __getitem__(self, indices):
         if cyclic_axes:
             x = [
                 axis
-                for axis, n0, n1 in zip(
-                    axes_with_non_integral_indices, shape, new.shape
-                )
-                if n1 != n0 and axis in cyclic_axes
+                for axis, n0, n1 in zip(new_axes, shape0, new.shape)
+                if axis in cyclic_axes and n0 != n1
             ]
             if x:
-                # Never change the _cyclic attribute in-place
+                # Never change the value of the _cyclic attribute
+                # in-place
                 new._cyclic = cyclic_axes.difference(x)
 
         # ------------------------------------------------------------
@@ -1124,6 +1131,11 @@ def __setitem__(self, indices, value):
         the `cf.masked` constant or by assignment to a value which
         contains masked elements.
 
+        **Performance**
+
+        If the shape of the data is unknown then it is calculated
+        immediately by executing all delayed operations.
+
         .. seealso:: `__getitem__`, `cf.masked`, `hardmask`, `where`
 
         **Examples:**
@@ -1154,9 +1166,7 @@ def __setitem__(self, indices, value):
             #           can be fixed, it's easiest to disallow this
             #           case, that was allowed pre-dask.
 
-        # ------------------------------------------------------------
         # Roll axes with cyclic slices
-        # ------------------------------------------------------------
         if roll:
             # For example, if assigning to slice(-2, 3) has been
             # requested on a cyclic axis (and we're not using numpy
@@ -1180,14 +1190,17 @@ def __setitem__(self, indices, value):
         dx = self._get_dask()
         dx[indices] = dask_compatible(value)
 
-        # ------------------------------------------------------------
         # Unroll any axes that were rolled to enable a cyclic
         # assignment
-        # ------------------------------------------------------------
         if roll:
             shifts = [-shift for shift in shifts]
             self.roll(shift=shifts, axis=roll_axes, inplace=True)
 
+        # Reset the mask hardness, otherwise it could be incorrect in
+        # the case that a chunk that was not a masked array is
+        # assigned missing values.
+        self.hardmask = self.hardmask
+
         return
 
     # ----------------------------------------------------------------
@@ -1269,7 +1282,9 @@ def __keepdims_indexing__(self, value):
     def _get_dask(self):
         """Get the dask array.
 
-        .. versionadded:: 4.0.0
+        .. versionadded:: TODODASK
+
+        .. seealso:: `_set_dask`, `_del_dask`
 
         :Returns:
 
@@ -1282,7 +1297,9 @@ def _get_dask(self):
     def _set_dask(self, array, copy=False, delete_source=True):
         """Set the dask array.
 
-        .. versionadded:: 4.0.0
+        .. versionadded:: TODODASK
+
+        .. seealso:: `_get_dask`, `_del_dask`
 
         :Parameters:
 
@@ -1314,7 +1331,9 @@ def _set_dask(self, array, copy=False, delete_source=True):
     def _del_dask(self, default=ValueError(), delete_source=True):
         """Remove the dask array.
 
-        .. versionadded:: 4.0.0
+        .. versionadded:: TODODASK
+
+        .. seealso:: `_set_dask`, `_get_dask`
 
         :Parameters:
 
@@ -1364,10 +1383,9 @@ def _del_dask(self, default=ValueError(), delete_source=True):
         return out
 
     def _dask_map_blocks(self, func, **kwargs):
-        """Apply a function in-place to the dask array using
-        `map_blocks`.
+        """Apply a function to the dask array using `map_blocks`.
 
-        .. versionadded:: 4.0.0
+        .. versionadded:: TODODASK
 
         :Parameters:
 
@@ -1376,8 +1394,8 @@ def _dask_map_blocks(self, func, **kwargs):
                 array.
 
             kwargs: optional
-                Keyword arguments passed to the map `map_blocks`
-                method of the dask array.
+                Keyword arguments passed to the
+                `dask.Array.map_blocks` method of the dask array.
 
         :Returns:
 
@@ -2459,14 +2477,14 @@ def loadd(self, d, chunk=True):
         self._size = reduce(mul, shape, 1)
 
         cyclic = d.get("_cyclic", None)
-        # Never change the _cyclic attribute in-place
+        # Never change the value of the _cyclic attribute in-place
         if cyclic:
             self._cyclic = cyclic.copy()
         else:
             self._cyclic = _empty_set
 
         HDF_chunks = d.get("_HDF_chunks", None)
-        # Never change the _HDF_chunks attribute in-place
+        # Never change the value of the _HDF_chunks attribute in-place
         if HDF_chunks:
             self._HDF_chunks = HDF_chunks.copy()
         else:
@@ -5578,13 +5596,12 @@ def _Units(self):
     def _cyclic(self):
         """Storage for axis cyclicity.
 
-        Identifies which axes are cyclic (and therefore allow cyclic
-        slicing).
-
-        It must be a subset of the axis identifiers given by the
-        `_axes` attribute.
+        Contains a `set` that identifies which axes are cyclic (and
+        therefore allow cyclic slicing). The set contains a subset of
+        the axis identifiers defined by the `_axes` attribute.
 
-        .. warning:: Never change the `_cyclic` attribute in-place.
+        .. warning:: Never change the value of the `_cyclic` attribute
+                     in-place.
 
         .. note:: When an axis identifier is removed from the `_axes`
                   attribute then it is automatically also removed from
@@ -5619,8 +5636,14 @@ def _HDF_chunks(self):
         del self._custom["_HDF_chunks"]
 
     @property
+    @daskified(1)
     def _hardmask(self):
-        """TODODASK."""
+        """Storage for the mask hardness.
+
+        Contains a `bool`, where `True` denotes a hard mask and `False`
+        denotes a soft mask.
+
+        """
         return self._custom["_hardmask"]
 
     @_hardmask.setter
@@ -5636,26 +5659,24 @@ def _hardmask(self):
     def _axes(self):
         """Storage for the axis identifiers.
 
-        Contains an ordered sequence of identifiers for each array
-        axis.
+        Contains a `tuple` of identifiers, one for each array axis.
 
         .. note:: When the axis identifiers are reset, then any axis
                   identifier named by the `_cyclic` attribute which is
                   not in the new `_axes` set is automatically removed
-                  `_cyclic` attribute.
+                  from the `_cyclic` attribute.
 
         """
         return self._custom["_axes"]
 
     @_axes.setter
     def _axes(self, value):
-        value = tuple(value)
-        self._custom["_axes"] = value
+        self._custom["_axes"] = tuple(value)
 
-        # Update cyclic: Remove cyclic axes that are not in the new
-        # axes (never update _cyclic in-place)
+        # Remove cyclic axes that are not in the new axes
         cyclic = self._cyclic
         if cyclic:
+            # Never change the value of the _cyclic attribute in-place
             self._cyclic = cyclic.intersection(value)
 
     # ----------------------------------------------------------------
@@ -5799,7 +5820,8 @@ def dtype(self):
     def dtype(self, value):
         dx = self._get_dask()
 
-        # Only change the datatype if it's different
+        # Only change the datatype if it's different to that of the
+        # dask array
         if dx.dtype != value:
             dx = dx.astype(value)
             self._set_dask(dx)
@@ -5841,11 +5863,15 @@ def hardmask(self):
 
         If the `hardmask` attribute is `True`, i.e. there is a hard
         mask, then unmasking an entry will silently not occur. This
-        feature prevents overwriting the mask. To allow the unmasking
-        of an entries when the data has a hard mask, the mask must
-        first to be softened using the `soften_mask` method, that
-        changes the `hardmask` attribute to `False`. The mask can be
-        re-hardened with `harden_mask` method.
+        feature prevents overwriting the mask.
+
+        To allow the unmasking of an entries when the data has a hard
+        mask, the mask must first to be softened, either by setting
+        the `hardmask` attribute to False or equivalently with the
+        `soften_mask` method.
+
+        The mask can be hardened by setting the `hardmask` attribute
+        to True or equivalently with the `harden_mask` method.
 
         .. seealso:: `harden_mask`, `soften_mask`, `where`
 
@@ -5882,11 +5908,14 @@ def hardmask(self):
         [9 9 9]
 
         """
-        return self._custom["_hardmask"]
+        return self._hardmask
 
     @hardmask.setter
     def hardmask(self, value):
-        raise AttributeError("TODODASK - use harden_mask/soften_mask instead")
+        if value:
+            self.harden_mask()
+        else:
+            self.soften_mask()
 
     @property
     @daskified(1)
@@ -5961,6 +5990,11 @@ def nbytes(self):
 
         Does not include bytes consumed by the array mask
 
+        **Performance**
+
+        If the number of bytes is unknown then it is calculated
+        immediately by executing all delayed operations.
+
         **Examples:**
 
         >>> d = cf.Data([[1, 1.5, 2]])
@@ -5978,9 +6012,13 @@ def nbytes(self):
 
         """
         dx = self._get_dask()
-        return dx.nbytes
+        if math.isnan(dx.size):
+            logger.warning(
+                "Computing data nbytes: Performance may be degraded"
+            )
+            dx.compute_chunk_sizes()
 
-    # TODODASK - what about nans (e.g. after da.unique)
+        return dx.nbytes
 
     @property
     @daskified(1)
@@ -6018,6 +6056,11 @@ def ndim(self):
     def shape(self):
         """Tuple of the data array's dimension sizes.
 
+        **Performance**
+
+        If the shape of the data is unknown then it is calculated
+        immediately by executing all delayed operations.
+
         **Examples:**
 
         >>> d = cf.Data([[1, 2, 3], [4, 5, 6]])
@@ -6038,16 +6081,22 @@ def shape(self):
 
         """
         dx = self._get_dask()
-        #        if nan: do some compute
-        return dx.shape
+        if math.isnan(dx.size):
+            logger.warning("Computing data shape: Performance may be degraded")
+            dx.compute_chunk_sizes()
 
-    # TODODASK - what about nans (e.g. after da.unique  dx.shape -> (nan,))
+        return dx.shape
 
     @property
     @daskified(1)
     def size(self):
         """Number of elements in the data array.
 
+        **Performance**
+
+        If the size of the data is unknown then it is calculated
+        immediately by executing all delayed operations.
+
         **Examples:**
 
         >>> d = cf.Data([[1, 2, 3], [4, 5, 6]])
@@ -6072,9 +6121,13 @@ def size(self):
 
         """
         dx = self._get_dask()
-        return dx.size
+        size = dx.size
+        if math.isnan(size):
+            logger.warning("Computing data size: Performance may be degraded")
+            dx.compute_chunk_sizes()
+            size = dx.size
 
-    # TODODASK - what about nans (e.g. after da.unique)
+        return size
 
     @property
     @daskified(1)
@@ -6188,7 +6241,6 @@ def datetime_array(self):
                 a.harden_mask()
             else:
                 a.soften_mask()
-        # --- End: if
 
         return a
 
@@ -8539,7 +8591,7 @@ def cyclic(self, axes=None, iscyclic=True):
 
         axes = [data_axes[i] for i in self._parse_axes(axes)]
 
-        # Never change the _cyclic attribute in-place
+        # Never change the value of the _cyclic attribute in-place
         if iscyclic:
             self._cyclic = cyclic_axes.union(axes)
         else:
@@ -9512,15 +9564,14 @@ def harden_mask(self):
         """Force the mask to hard.
 
         Whether the mask of a masked array is hard or soft is
-        determined by its the `hardmask` property. `harden_mask` sets
+        determined by its `hardmask` property. `harden_mask` sets
         `hardmask` to True.
 
+        .. versionadded:: TODODASK
 
-        **Examples:**
+        .. seealso:: `hardmask`, `soften_mask`
 
-        >>> d = cf.Data([1, 2, 3])
-        >>> d.hardmask
-        True
+        **Examples:**
 
         >>> d = cf.Data([1, 2, 3], hardmask=False)
         >>> d.hardmask
@@ -9529,24 +9580,27 @@ def harden_mask(self):
         >>> d.hardmask
         True
 
-        """
-
-        def harden_mask(a):
-            if np.ma.isMA(a):
-                a.harden_mask()
+        >>> d = cf.Data([1, 2, 3], mask=[False, True, False])
+        >>> d.hardmask
+        True
+        >>> d[1] = 999
+        >>> print(d.array)
+        [1 -- 3]
 
-            return a
+        """
+        self._dask_map_blocks(_harden_mask_chunk, dtype=self.dtype)
+        self._hardmask = True
 
-        if self._hardmask:
-            # Mask is already hard
-            return
+    def soften_mask(self):
+        """Force the mask to soft.
 
-        self._dask_map_blocks(harden_mask, dtype=self.dtype)
+        Whether the mask of a masked array is hard or soft is
+        determined by its `hardmask` property. `soften_mask` sets
+        `hardmask` to False.
 
-        self._hardmask = True
+        .. versionadded:: TODODASK
 
-    def soften_mask(self):
-        """TODODASK.
+        .. seealso:: `hardmask`, `harden_mask`
 
         **Examples:**
 
@@ -9557,32 +9611,17 @@ def soften_mask(self):
         >>> d.hardmask
         False
 
-        """
-
-        def soften_mask(a):
-            if np.ma.isMA(a):
-                a.soften_mask()
-
-            return a
-
-        if not self._hardmask:
-            # Mask is already soft
-            return
-
-        self._dask_map_blocks(soften_mask, dtype=self.dtype)
+        >>> d = cf.Data([1, 2, 3], mask=[False, True, False], hardmask=False)
+        >>> d.hardmask
+        False
+        >>> d[1] = 999
+        >>> print(d.array)
+        [1  999  3]
 
+        """
+        self._dask_map_blocks(_soften_mask_chunk, dtype=self.dtype)
         self._hardmask = False
 
-    def _set_mask_hardness(self, hardmask=None):
-        """TODODASK."""
-        if hardmask is None:
-            hardmask = self.hardmask
-
-        if hardmask:
-            self.harden_mask()
-        else:
-            self.soften_mask()
-
     @_inplace_enabled(default=False)
     def filled(self, fill_value=None, inplace=False):
         """Replace masked elements with the fill value.
@@ -10746,8 +10785,8 @@ def HDF_chunks(self, *chunks):
         chunks = chunks[0]
 
         if chunks is None:
-            # Clear all chunking. Never change the _HDF_chunks
-            # attribute in-place.
+            # Clear all chunking. Never change the value of the
+            # _HDF_chunks attribute in-place.
             self._HDF_chunks = None
             return org_HDF_chunks
 
@@ -10758,7 +10797,7 @@ def HDF_chunks(self, *chunks):
         if _HDF_chunks.values() == [None] * self.ndim:
             _HDF_chunks = None
 
-        # Never change the _HDF_chunks attribute in-place
+        # Never change the value of the _HDF_chunks attribute in-place
         self._HDF_chunks = _HDF_chunks
 
         return org_HDF_chunks
@@ -12142,7 +12181,7 @@ def squeeze(self, axes=None, inplace=False, i=False):
 
         hdf = self._HDF_chunks
         if hdf:
-            # Never change the _HDF_chunks attribute in-place
+            # Never change the value of the _HDF_chunks attribute in-place
             self._HDF_chunks = {
                 axis: size for axis, size in hdf.items() if axis not in axes
             }
@@ -13481,3 +13520,54 @@ def _broadcast(a, shape):
     tile = shape[0 : len(shape) - len(a_shape)] + tuple(tile[::-1])
 
     return np.tile(a, tile)
+
+
+"""Dask utilities to be called on chunks"""
+
+
+def _harden_mask_chunk(a):
+    """Harden the mask of a masked `numpy` array.
+
+    Has no effect if the array is not a masked array.
+
+    :Parameters:
+
+        a: `numpy.ndarray`
+            The array to have a hardened mask.
+
+    :Returns:
+
+        `numpy.ndarray`
+            The array with hardened mask.
+
+    :Returns:
+
+        `numpy.ndarray`
+
+    """
+    if np.ma.isMA(a):
+        a.harden_mask()
+
+    return a
+
+
+def _soften_mask_chunk(a):
+    """Soften the mask of a maked `numpy` array.
+
+    Has no effect if the array is not a masked array.
+
+    :Parameters:
+
+        a: `numpy.ndarray`
+            The array to have a softened mask.
+
+    :Returns:
+
+        `numpy.ndarray`
+            The array with softened mask.
+
+    """
+    if np.ma.isMA(a):
+        a.soften_mask()
+
+    return a
diff --git a/cf/field.py b/cf/field.py
index d40ed264f4..cf7e46ac80 100644
--- a/cf/field.py
+++ b/cf/field.py
@@ -460,13 +460,9 @@ def __getitem__(self, indices):
                         f"{self.constructs.domain_axis_identity(_)!r} axis"
                     )
 
-                logger.debug(
-                    f"    roll, iaxis, shift = {roll} {iaxis} {shift}"
-                )  # pragma: no cover
-
-                new = new.roll(iaxis, shift)
+                new = new.roll(shift=shift, axis=iaxis)
         else:
-            new = self.copy()
+            new = self.copy(array=False)
 
         # ------------------------------------------------------------
         # Subspace the field construct's data
@@ -482,7 +478,7 @@ def __getitem__(self, indices):
         logger.debug("    indices2 = {}".format(indices2))  # pragma: no cover
         logger.debug("    findices = {}".format(findices))  # pragma: no cover
 
-        new_data = new.data[tuple(findices)]
+        new_data = data[tuple(findices)]
 
         # Set sizes of domain axes
         data_axes = new.get_data_axes()
diff --git a/cf/functions.py b/cf/functions.py
index c8e48540e0..f45ca37429 100644
--- a/cf/functions.py
+++ b/cf/functions.py
@@ -32,11 +32,9 @@
 from numpy import allclose as _x_numpy_allclose
 from numpy import ascontiguousarray as _numpy_ascontiguousarray
 from numpy import isclose as _x_numpy_isclose
-from numpy import ndim as _numpy_ndim
 from numpy import shape as _numpy_shape
 from numpy import take as _numpy_take
 from numpy import tile as _numpy_tile
-from numpy import where as _numpy_where
 from numpy.ma import all as _numpy_ma_all
 from numpy.ma import allclose as _numpy_ma_allclose
 from numpy.ma import is_masked as _numpy_ma_is_masked
@@ -1943,12 +1941,8 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True):
         )
 
     for i, (index, size) in enumerate(zip(parsed_indices, shape)):
-        is_slice = False
-        if isinstance(index, slice):
-            # --------------------------------------------------------
-            # Index is a slice
-            # --------------------------------------------------------
-            is_slice = True
+        if cyclic and isinstance(index, slice):
+            # Check for a cyclic slice
             start = index.start
             stop = index.stop
             step = index.step
@@ -1998,11 +1992,8 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True):
                 # -9:0:1  => [1, 2, 3, 4, 5, 6, 7, 8, 9]
                 # -9:1:1  => [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
                 # -10:0:1 => [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
-                if cyclic:
-                    index = slice(0, stop - start, step)
-                    roll[i] = -start
-                else:
-                    index = slice(start, stop, step)
+                index = slice(0, stop - start, step)
+                roll[i] = -start
 
             elif step < 0 and 0 <= start < size and start - size <= stop < 0:
                 # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
@@ -2012,46 +2003,15 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True):
                 # 6:-4:-1  => [6, 5, 4, 3, 2, 1, 0, 9, 8, 7]
                 # 0:-2:-1  => [0, 9]
                 # 0:-10:-1 => [0, 9, 8, 7, 6, 5, 4, 3, 2, 1]
-                if cyclic:
-                    index = slice(start - stop - 1, None, step)
-                    roll[i] = -1 - stop
-                else:
-                    index = slice(start, stop, step)
+                index = slice(start - stop - 1, None, step)
+                roll[i] = -1 - stop
 
+        elif keepdims and isinstance(index, Integral):
+            # Convert an integral index to a slice
+            if index == -1:
+                index = slice(-1, None, None)
             else:
-                start, stop, step = index.indices(size)
-                if (
-                    start == stop
-                    or (start < stop and step < 0)
-                    or (start > stop and step > 0)
-                ):
-                    raise IndexError(
-                        f"Invalid indices dimension with size {size}: {index}"
-                    )
-
-                if step < 0 and stop < 0:
-                    stop = None
-                index = slice(start, stop, step)
-
-        elif isinstance(index, Integral):
-            # --------------------------------------------------------
-            # Index is an integer
-            # --------------------------------------------------------
-            if index < 0:
-                index += size
-
-            if keepdims:
                 index = slice(index, index + 1, 1)
-                is_slice = True
-
-        if is_slice:
-            # If step is greater than one then make sure that
-            # index.stop isn't bigger than it needs to be
-            if cyclic and index.step > 1:
-                start, stop, step = index.indices(size)
-                div, mod = divmod(stop - start - 1, step)
-                stop = start + div * step + 1
-                index = slice(start, stop, step)
 
         parsed_indices[i] = index
 
diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py
index b0c8afac6a..73839b9d9a 100644
--- a/cf/test/test_Data.py
+++ b/cf/test/test_Data.py
@@ -855,7 +855,9 @@ def test_Data__getitem__(self):
             ([1, 4, 3], -2, [2, -1]),
             ([4, 1, 4], -2, [2, -1]),
         ):
-            self.assertEqual(d[indices].shape, (3, 1, 2))
+            e = d[indices]
+            self.assertEqual(e.shape, (3, 1, 2))
+            self.assertEqual(e._axes, d._axes)
 
         d.__keepdims_indexing__ = False
         self.assertFalse(d.__keepdims_indexing__)
@@ -867,7 +869,10 @@ def test_Data__getitem__(self):
             ([4, 3, 4], -2, [2, -1]),
             ([1, 4, 4], -2, [2, -1]),
         ):
-            self.assertEqual(d[indices].shape, (3, 2))
+            e = d[indices]
+            self.assertFalse(e.__keepdims_indexing__)
+            self.assertEqual(e.shape, (3, 2))
+            self.assertEqual(e._axes, d._axes[0::2])
 
         self.assertFalse(d.__keepdims_indexing__)
         d.__keepdims_indexing__ = True
@@ -881,10 +886,18 @@ def test_Data__getitem__(self):
         d.cyclic(1)
         self.assertTrue((d[0, :6].array == [[0, 1, 2, 3, 4, 5]]).all())
         e = d[0, -2:4]
+        self.assertEqual(e._axes, d._axes)
         self.assertEqual(e.shape, (1, 6))
         self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3]]).all())
         self.assertFalse(e.cyclic())
 
+        d.__keepdims_indexing__ = False
+        e = d[:, 4]
+        self.assertEqual(e.shape, (3,))
+        self.assertFalse(e.cyclic())
+        self.assertEqual(e._axes, d._axes[0:1])
+        d.__keepdims_indexing__ = True
+
         e = d[0, -2:6]
         self.assertEqual(e.shape, (1, 8))
         self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3, 4, 5]]).all())
@@ -944,7 +957,7 @@ def test_Data__getitem__(self):
         # TODODASK: Test __getitem__ with ancillary masks. Can only do
         #           this when cf.Data.where has been daskified
 
-    def test_Data___setitem__(self):
+    def test_Data__setitem__(self):
         if self.test_only and inspect.stack()[0][3] not in self.test_only:
             return
 
@@ -955,9 +968,9 @@ def test_Data___setitem__(self):
             else:
                 a.soften_mask()
 
-            d = cf.Data(a.copy(), "metres", hardmask=hardmask)
+            d = cf.Data(a.copy(), "metres", hardmask=hardmask, chunks=(3, 5))
 
-            a[:, 1] = cf.masked
+            a[:, 1] = np.ma.masked
             d[:, 1] = cf.masked
 
             a[0, 2] = -6
@@ -997,7 +1010,7 @@ def test_Data___setitem__(self):
         self.assertTrue((d.array == a).all())
         self.assertTrue((d.array.mask == a.mask).all())
 
-        # Cyclic
+        # Cyclic axes
         d.cyclic(1)
         self.assertTrue((d[0].array == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).all())
         d[0, -1:1] = [-99, -1]
@@ -1006,6 +1019,7 @@ def test_Data___setitem__(self):
         )
         self.assertEqual(d.cyclic(), set([1]))
 
+        # Multiple list/1-d array indices
         with self.assertRaises(NotImplementedError):
             d[[1, 2], [0, 4, 1]] = 9
 

From 021e068d2558e3d3b152f8fe4c1e02113a61910a Mon Sep 17 00:00:00 2001
From: David Hassell <david.hassell@ncas.ac.uk>
Date: Thu, 9 Sep 2021 09:04:26 +0100
Subject: [PATCH 06/12] information

---
 cf/data/QUESTIONS.rst | 25 +++++++++++++++++++++++++
 cf/data/README.rst    | 26 ++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)
 create mode 100644 cf/data/QUESTIONS.rst
 create mode 100644 cf/data/README.rst

diff --git a/cf/data/QUESTIONS.rst b/cf/data/QUESTIONS.rst
new file mode 100644
index 0000000000..9fcd8e3f12
--- /dev/null
+++ b/cf/data/QUESTIONS.rst
@@ -0,0 +1,25 @@
+Questions and answers
+=====================
+
+A place to record randoms thought about the daskification of
+`cf.Data`, possibly prior to starting an issue on GitHub.
+
+----
+
+Q. When we run something that executes all of the lazy operations
+   (like `cf.Data.is_masked`), should/could we replace the dask array
+   with a "persisted" version of the computed data? If we did this, we
+   would want to have the ability to cache persisted chunks to disk,
+   as they came into being on each thread (see, for instance,
+   `chest`). Do do this or not this could be controlled by a
+   configuation setting.
+
+A. ?
+
+----
+
+Q.
+
+A. ?
+
+----
diff --git a/cf/data/README.rst b/cf/data/README.rst
new file mode 100644
index 0000000000..2b4cbe709e
--- /dev/null
+++ b/cf/data/README.rst
@@ -0,0 +1,26 @@
+`cf.Data` developer notes
+=========================
+
+Hardness of the mask
+--------------------
+
+Every `cf.Data` method that changes the dask array should consider
+whether or not the mask hardness needs resetting before
+returning. This will be necessary if there is the possibility that the
+operation being applied to the dask array could lose the "memory" on
+its chunks of whether or not the mask is hard.
+
+A common situation that causes a chunk to lose its memory of whether
+or not the mask is hard is when a chunk could have contained a
+unmasked `numpy` array prior to the operation, but the operation could
+convert it to a masked `numpy` array. The new masked array will always
+have the `numpy` default hardness (i.e. soft), which may be
+incorrect.
+
+The mask hardness is reset as follows:
+
+.. code-block:: python
+
+   self._reset_mask_hardness()
+
+See `cf.Data.__setitem__` for an example.

From 9c1a4c42c7204d580f598d2a2c0528cb39cae449 Mon Sep 17 00:00:00 2001
From: David Hassell <david.hassell@ncas.ac.uk>
Date: Thu, 9 Sep 2021 09:05:08 +0100
Subject: [PATCH 07/12] tidy

---
 cf/data/data.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/cf/data/data.py b/cf/data/data.py
index a18cdb243e..2999d7a86a 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -1199,7 +1199,7 @@ def __setitem__(self, indices, value):
         # Reset the mask hardness, otherwise it could be incorrect in
         # the case that a chunk that was not a masked array is
         # assigned missing values.
-        self.hardmask = self.hardmask
+        self._reset_mask_hardness()
 
         return
 
@@ -1416,6 +1416,18 @@ def _dask_map_blocks(self, func, **kwargs):
 
         return dx
 
+    def _reset_mask_hardness(self):
+        """Re-apply the mask hardness to the dask array.
+
+        .. versionadded:: TODODASK
+
+        :Returns:
+
+            `None`
+
+        """
+        self.hardmask = self.hardmask
+
     @_inplace_enabled(default=False)
     def diff(self, axis=-1, n=1, inplace=False):
         """Calculate the n-th discrete difference along the given axis.
@@ -5936,13 +5948,6 @@ def is_masked(self):
         True
 
         """
-        # TODODASK: (General point) When we run something that
-        #           executes all of the lazy operations (like
-        #           is_masked), should/could we replace the dask array
-        #           of self with a "persisted" version of the computed
-        #           data? If we did this, we would want to have the
-        #           ability to cache persisted chunks to disk, as they
-        #           came into being on each thread.
 
         def is_masked(a):
             out = np.ma.is_masked(a)
@@ -10306,7 +10311,8 @@ def datum(self, *index):
         """Return an element of the data array as a standard Python
         scalar.
 
-        TODODASK consider renameing/aliasing to 'item'
+        TODODASK: consider renameing/aliasing to 'item'. Might depend
+                  on whether or not the APIs are the same.
 
         The first and last elements are always returned with
         ``d.datum(0)`` and ``d.datum(-1)`` respectively, even if the data

From 9d3f5a7b84b0ef661d7635142231e4935aaa2d0f Mon Sep 17 00:00:00 2001
From: David Hassell <david.hassell@ncas.ac.uk>
Date: Thu, 9 Sep 2021 10:39:40 +0100
Subject: [PATCH 08/12] hardmask

---
 cf/data/README.rst   | 12 +++----
 cf/data/data.py      | 80 ++++++++++++++++++++------------------------
 cf/test/test_Data.py |  2 +-
 3 files changed, 43 insertions(+), 51 deletions(-)

diff --git a/cf/data/README.rst b/cf/data/README.rst
index 2b4cbe709e..6e404b720e 100644
--- a/cf/data/README.rst
+++ b/cf/data/README.rst
@@ -4,7 +4,7 @@
 Hardness of the mask
 --------------------
 
-Every `cf.Data` method that changes the dask array should consider
+Any `cf.Data` method that changes the dask array should consider
 whether or not the mask hardness needs resetting before
 returning. This will be necessary if there is the possibility that the
 operation being applied to the dask array could lose the "memory" on
@@ -17,10 +17,8 @@ convert it to a masked `numpy` array. The new masked array will always
 have the `numpy` default hardness (i.e. soft), which may be
 incorrect.
 
-The mask hardness is reset as follows:
+The mask hardness is most easily reset with the
+`cf.Data._reset_mask_hardness` method.
 
-.. code-block:: python
-
-   self._reset_mask_hardness()
-
-See `cf.Data.__setitem__` for an example.
+`cf.Data.__setitem__` and `cf.Data.where` are examples of methods that
+need to reset the mask in this manner.
diff --git a/cf/data/data.py b/cf/data/data.py
index 2999d7a86a..6ede789f14 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -1382,20 +1382,26 @@ def _del_dask(self, default=ValueError(), delete_source=True):
 
         return out
 
-    def _dask_map_blocks(self, func, **kwargs):
-        """Apply a function to the dask array using `map_blocks`.
+    def _map_blocks(self, func, **kwargs):
+        """Apply a function to the data in-place.
+
+        .. note:: It may be necessary for a call to `_map_blocks` to
+                  be followed by a call to `_reset_mask_hardness`.
 
         .. versionadded:: TODODASK
 
+        .. seealso:: `_reset_mask_hardness`
+
         :Parameters:
 
             func:
-                The funciton to be applied to each chunk of the dask
+                The funciton to be applied to the data, via
+                `dask.Array.map_blocks`, to each chunk of the dask
                 array.
 
             kwargs: optional
                 Keyword arguments passed to the
-                `dask.Array.map_blocks` method of the dask array.
+                `dask.Array.map_blocks` method.
 
         :Returns:
 
@@ -1405,7 +1411,7 @@ def _dask_map_blocks(self, func, **kwargs):
         **Examples:**
 
         >>> d = cf.Data([1, 2, 3])
-        >>> dx = d._dask_map_blocks(lambda x: x / 2, dtype=float)
+        >>> dx = d._map_blocks(lambda x: x / 2)
         >>> print(d.array)
         [0.5 1.  1.5]
 
@@ -5652,8 +5658,10 @@ def _HDF_chunks(self):
     def _hardmask(self):
         """Storage for the mask hardness.
 
-        Contains a `bool`, where `True` denotes a hard mask and `False`
-        denotes a soft mask.
+        Contains a `bool`, where `True` denotes a hard mask and
+        `False` denotes a soft mask.
+
+        See `hardmask` for details.
 
         """
         return self._custom["_hardmask"]
@@ -5662,10 +5670,6 @@ def _hardmask(self):
     def _hardmask(self, value):
         self._custom["_hardmask"] = value
 
-    @_hardmask.deleter
-    def _hardmask(self):
-        del self._custom["_hardmask"]
-
     @property
     @daskified(1)
     def _axes(self):
@@ -5758,9 +5762,9 @@ def Units(self, value):
                 dtype = _dtype_float32
             else:
                 dtype = _dtype_float
-        # --- End: if
 
-        self._dask_map_blocks(
+        # Apply the units conversion to the data
+        self._map_blocks(
             partial(
                 Units.conform,
                 from_units=old_units,
@@ -5874,18 +5878,22 @@ def hardmask(self):
         """Hardness of the mask.
 
         If the `hardmask` attribute is `True`, i.e. there is a hard
-        mask, then unmasking an entry will silently not occur. This
-        feature prevents overwriting the mask.
+        mask, then unmasking an entry will silently not occur. This is
+        the default, and prevents overwriting the mask.
+
+        If the `hardmask` attribute is `False`, i.e. there is a soft
+        mask, then masked entries may be overwritten with non-missing
+        values.
 
-        To allow the unmasking of an entries when the data has a hard
-        mask, the mask must first to be softened, either by setting
-        the `hardmask` attribute to False or equivalently with the
-        `soften_mask` method.
+        To allow the unmasking of masked values, the mask must be
+        softened by setting the `hardmask` attribute to False, or
+        equivalently with the `soften_mask` method.
 
         The mask can be hardened by setting the `hardmask` attribute
-        to True or equivalently with the `harden_mask` method.
+        to True, or equivalently with the `harden_mask` method.
 
-        .. seealso:: `harden_mask`, `soften_mask`, `where`
+        .. seealso:: `harden_mask`, `soften_mask`, `where`,
+                     `__setitem__`
 
         **Examples:**
 
@@ -5895,29 +5903,15 @@ def hardmask(self):
         >>> d[0] = cf.masked
         >>> print(d.array)
         [-- 2 3]
-        >>> d[...]= 9
+        >>> d[...]= 999
         >>> print(d.array)
-        [-- 9 9]
-        >>> d.soften_mask()
+        [-- 999 999]
+        >>> d.hardmask = False
         >>> d.hardmask
         False
         >>> d[...] = -1
-        False
         >>> print(d.array)
         [-1 -1 -1]
-        >>> d.harden_mask()
-        >>> d.hardmask
-        True
-        >>> d[0] = cf.masked
-        >>> d = d.where(True, 9)
-        >>> print(d.array)
-        [-- 9 9]
-        >>> d.soften_mask()
-        >>> d.hardmask
-        False
-        >>> d = d.where(True, 9)
-        >>> print(d.array)
-        [9 9 9]
 
         """
         return self._hardmask
@@ -9570,7 +9564,7 @@ def harden_mask(self):
 
         Whether the mask of a masked array is hard or soft is
         determined by its `hardmask` property. `harden_mask` sets
-        `hardmask` to True.
+        `hardmask` to `True`.
 
         .. versionadded:: TODODASK
 
@@ -9593,7 +9587,7 @@ def harden_mask(self):
         [1 -- 3]
 
         """
-        self._dask_map_blocks(_harden_mask_chunk, dtype=self.dtype)
+        self._map_blocks(_harden_mask_chunk, dtype=self.dtype)
         self._hardmask = True
 
     def soften_mask(self):
@@ -9601,7 +9595,7 @@ def soften_mask(self):
 
         Whether the mask of a masked array is hard or soft is
         determined by its `hardmask` property. `soften_mask` sets
-        `hardmask` to False.
+        `hardmask` to `False`.
 
         .. versionadded:: TODODASK
 
@@ -9621,10 +9615,10 @@ def soften_mask(self):
         False
         >>> d[1] = 999
         >>> print(d.array)
-        [1  999  3]
+        [  1 999   3]
 
         """
-        self._dask_map_blocks(_soften_mask_chunk, dtype=self.dtype)
+        self._map_blocks(_soften_mask_chunk, dtype=self.dtype)
         self._hardmask = False
 
     @_inplace_enabled(default=False)
diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py
index 73839b9d9a..6dd4648ab5 100644
--- a/cf/test/test_Data.py
+++ b/cf/test/test_Data.py
@@ -824,7 +824,7 @@ def test_Data__getitem__(self):
         if self.test_only and inspect.stack()[0][3] not in self.test_only:
             return
 
-        d = cf.Data(np.ma.arange(450).reshape(9, 10, 5))
+        d = cf.Data(np.ma.arange(450).reshape(9, 10, 5), chunks=(4, 5, 1))
 
         for indices in (
             Ellipsis,

From 742c3e202d1fba040598493edf780477d39ec438 Mon Sep 17 00:00:00 2001
From: David Hassell <david.hassell@ncas.ac.uk>
Date: Mon, 13 Sep 2021 15:58:56 +0100
Subject: [PATCH 09/12] dask reset_mask_hardness, docs

---
 cf/data/data.py  | 145 +++++++++++++++++++++++++++++++++--------------
 cf/data/utils.py |   8 +--
 2 files changed, 105 insertions(+), 48 deletions(-)

diff --git a/cf/data/data.py b/cf/data/data.py
index 6ede789f14..91f228481c 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -585,7 +585,7 @@ def __init__(
             self._Units = units
 
         # Store the dask array
-        self._set_dask(array, delete_source=False)
+        self._set_dask(array, delete_source=False, reset_mask_hardness=False)
 
         # Set the mask hardness on each chunk.
         self.hardmask = hardmask
@@ -1049,15 +1049,17 @@ def __getitem__(self, indices):
                     ]
                     dx = dx[tuple(slice_indices)]
         else:
-            raise ValueError("Orthogonal indexing is currently required.")
+            raise NotImplementedError(
+                "Non-orthogonal indexing has not yet been implemented"
+            )
 
         # ------------------------------------------------------------
         # Set the subspaced dask array
         # ------------------------------------------------------------
-        new._set_dask(dx)
+        new._set_dask(dx, reset_mask_hardness=False)
 
         # ------------------------------------------------------------
-        # Set the axis identifiers for the subspace
+        # Get the axis identifiers for the subspace
         # ------------------------------------------------------------
         shape0 = shape
         if keepdims:
@@ -1160,11 +1162,11 @@ def __setitem__(self, indices, value):
             # TODODASK: The inherited algorithm that does assignment
             #           for multiple list/1-d array indices
             #           (cfdm.Data._set_subspace) won't work when the
-            #           1-d array is a dask array because the array
-            #           may need to be computed at __setitem__
-            #           runtime, which is not desirable. Until this
-            #           can be fixed, it's easiest to disallow this
-            #           case, that was allowed pre-dask.
+            #           1-d array is a dask array because it may need
+            #           to be computed at __setitem__ runtime, which
+            #           is not desirable. Until this can be fixed,
+            #           it's easiest to disallow this case, that was
+            #           allowed pre-dask.
 
         # Roll axes with cyclic slices
         if roll:
@@ -1222,13 +1224,21 @@ def __orthogonal_indexing__(self):
                      `__setitem__`,
                      `netCDF4.Variable.__orthogonal_indexing__`
 
-        **Examples:**
+        **Examples**
 
-        >>> d = cf.Data([[1, 2, 3], [4, 5, 6]])
-        >>> d[[0], [0, 2]].shape
+        >>> d = cf.Data([[1, 2, 3],
+        ...              [4, 5, 6]])
+        >>> e = d[[0], [0, 2]]
+        >>> e.shape
         (1, 2)
-        >>> d[[0, 1], [0, 2]].shape
+        >>> print(e.array)
+        [[1 3]]
+        >>> e = d[[0, 1], [0, 2]]
+        >>> e.shape
         (2, 2)
+        >>> print(e.array)
+        [[1 3]
+         [4 6]]
 
         """
         return True
@@ -1251,23 +1261,57 @@ def __keepdims_indexing__(self):
         .. seealso:: `__orthogonal_indexing__`, `__getitem__`,
                      `__setitem__`
 
-        **Examples:**
+        **Examples**
 
-        >>> d = cf.Data([[1, 2, 3], [4, 5, 6]])
-        >>> d.__keepdims_indexing__ = True
-        >>> d[0].shape
+        >>> d = cf.Data([[1, 2, 3],
+        ...              [4, 5, 6]])
+        >>> d.__keepdims_indexing__
+        True
+        >>> e = d[0]
+        >>> e.shape
         (1, 3)
-        >>> d[:, 1].shape
+        >>> print(e.array)
+        [[1 2 3]]
+
+        >>> d.__keepdims_indexing__
+        True
+        >>> e = d[:, 1]
+        >>> e.shape
         (2, 1)
-        >>> d[0, 1].shape
+        >>> print(e.array)
+        [[2]
+         [5]]
+
+        >>> d.__keepdims_indexing__
+        True
+        >>> e = d[0, 1]
+        >>> e.shape
         (1, 1)
+        >>> print(e.array)
+        [[2]]
+
         >>> d.__keepdims_indexing__ = False
-        >>> d[0].shape
+        >>> e = d[0]
+        >>> e.shape
         (3,)
-        >>> d[:, 1].shape
+        >>> print(e.array)
+        [1 2 3]
+
+        >>> d.__keepdims_indexing__
+        False
+        >>> e = d[:, 1]
+        >>> e.shape
         (2,)
-        >>> d[0, 1].shape
+        >>> print(e.array)
+        [2 5]
+
+        >>> d.__keepdims_indexing__
+        False
+        >>> e = d[0, 1]
+        >>> e.shape
         ()
+        >>> print(e.array)
+        2
 
         """
         return self._custom.get("__keepdims_indexing__", True)
@@ -1294,12 +1338,14 @@ def _get_dask(self):
         """
         return self._custom["dask"]
 
-    def _set_dask(self, array, copy=False, delete_source=True):
+    def _set_dask(
+        self, array, copy=False, delete_source=True, reset_mask_hardness=True
+    ):
         """Set the dask array.
 
         .. versionadded:: TODODASK
 
-        .. seealso:: `_get_dask`, `_del_dask`
+        .. seealso:: `_get_dask`, `_del_dask`, `_reset_mask_hardness`
 
         :Parameters:
 
@@ -1311,7 +1357,14 @@ def _set_dask(self, array, copy=False, delete_source=True):
                 default the dask array is not copied.
 
             delete_source: `bool`, optional
-                TODODASK
+                If False then do not delete a compressed source array,
+                if one exists, after setting the new dask array. By
+                default a compressed source array is deleted.
+
+            reset_mask_hardness: `bool`, optional
+                If False then do not reset the mask hardness after
+                setting the new dask array. By default the mask
+                hardness is re-applied.
 
         :Returns:
 
@@ -1328,6 +1381,9 @@ def _set_dask(self, array, copy=False, delete_source=True):
             # guarantee its consistency with the new dask array.
             self._del_Array(None)
 
+        if reset_mask_hardness:
+            self._reset_mask_hardness()
+
     def _del_dask(self, default=ValueError(), delete_source=True):
         """Remove the dask array.
 
@@ -1385,8 +1441,9 @@ def _del_dask(self, default=ValueError(), delete_source=True):
     def _map_blocks(self, func, **kwargs):
         """Apply a function to the data in-place.
 
-        .. note:: It may be necessary for a call to `_map_blocks` to
-                  be followed by a call to `_reset_mask_hardness`.
+        .. note:: This method does not reset the mask hardness. It may
+                  be necessary for a call to `_map_blocks` to be
+                  followed by a call to `_reset_mask_hardness`.
 
         .. versionadded:: TODODASK
 
@@ -1396,12 +1453,12 @@ def _map_blocks(self, func, **kwargs):
 
             func:
                 The funciton to be applied to the data, via
-                `dask.Array.map_blocks`, to each chunk of the dask
-                array.
+                `dask.array.Array.map_blocks`, to each chunk of the
+                dask array.
 
             kwargs: optional
                 Keyword arguments passed to the
-                `dask.Array.map_blocks` method.
+                `dask.array.Array.map_blocks` method.
 
         :Returns:
 
@@ -1418,7 +1475,7 @@ def _map_blocks(self, func, **kwargs):
         """
         dx = self._get_dask()
         dx = dx.map_blocks(func, **kwargs)
-        self._set_dask(dx)
+        self._set_dask(dx, reset_mask_hardness=False)
 
         return dx
 
@@ -1427,6 +1484,8 @@ def _reset_mask_hardness(self):
 
         .. versionadded:: TODODASK
 
+        .. seealso:: `hardmask`, `harden_mask`, `soften_mask`
+
         :Returns:
 
             `None`
@@ -2212,7 +2271,7 @@ def persist(self, inplace=False):
 
         dx = self._get_dask()
         dx = dx.persist()
-        d._set_dask(dx)
+        d._set_dask(dx, reset_mask_hardness=False)
 
         return d
 
@@ -3151,7 +3210,7 @@ def rechunk(
         dx = d._get_dask()
         dx = dx.rechunk(chunks, threshold, block_size_limit, balance)
 
-        d._set_dask(dx, delete_source=False)
+        d._set_dask(dx, delete_source=False, reset_mask_hardness=False)
 
         return d
 
@@ -4417,7 +4476,7 @@ def _unary_operation(self, operation):
         dx = self._get_dask()
         dx = getattr(operator, operation)(dx)
 
-        out._set_dask(dx)
+        out._set_dask(dx, reset_mask_hardness=False)
 
         return out
 
@@ -5840,7 +5899,7 @@ def dtype(self, value):
         # dask array
         if dx.dtype != value:
             dx = dx.astype(value)
-            self._set_dask(dx)
+            self._set_dask(dx, reset_mask_hardness=False)
 
     @property
     def fill_value(self):
@@ -9129,7 +9188,7 @@ def insert_dimension(self, position=0, inplace=False):
 
         dx = d._get_dask()
         dx = dx.reshape(shape)
-        d._set_dask(dx)
+        d._set_dask(dx, reset_mask_hardness=False)
 
         # Expand _axes
         axis = new_axis_identifier(d._axes)
@@ -9587,7 +9646,7 @@ def harden_mask(self):
         [1 -- 3]
 
         """
-        self._map_blocks(_harden_mask_chunk, dtype=self.dtype)
+        self._map_blocks(_cf_harden_mask, dtype=self.dtype)
         self._hardmask = True
 
     def soften_mask(self):
@@ -9618,7 +9677,7 @@ def soften_mask(self):
         [  1 999   3]
 
         """
-        self._map_blocks(_soften_mask_chunk, dtype=self.dtype)
+        self._map_blocks(_cf_soften_mask, dtype=self.dtype)
         self._hardmask = False
 
     @_inplace_enabled(default=False)
@@ -10762,7 +10821,7 @@ def flip(self, axes=None, inplace=False, i=False):
 
         dx = d._get_dask()
         dx = dx[tuple(index)]
-        d._set_dask(dx)
+        d._set_dask(dx, reset_mask_hardness=False)
 
         return d
 
@@ -12174,7 +12233,7 @@ def squeeze(self, axes=None, inplace=False, i=False):
         # one size 1 axis needs squeezing.
         dx = d._get_dask()
         dx = dx.squeeze(axis=tuple(axes))
-        d._set_dask(dx)
+        d._set_dask(dx, reset_mask_hardness=False)
 
         # Remove the squeezed axes names
         d._axes = [axis for i, axis in enumerate(d._axes) if i not in axes]
@@ -12726,7 +12785,7 @@ def roll(self, axis, shift, inplace=False, i=False):
 
         dx = d._get_dask()
         dx = da.roll(dx, shift, axis=axis)
-        d._set_dask(dx)
+        d._set_dask(dx, reset_mask_hardness=False)
 
         return d
 
@@ -13525,7 +13584,7 @@ def _broadcast(a, shape):
 """Dask utilities to be called on chunks"""
 
 
-def _harden_mask_chunk(a):
+def _cf_harden_mask(a):
     """Harden the mask of a masked `numpy` array.
 
     Has no effect if the array is not a masked array.
@@ -13551,7 +13610,7 @@ def _harden_mask_chunk(a):
     return a
 
 
-def _soften_mask_chunk(a):
+def _cf_soften_mask(a):
     """Soften the mask of a maked `numpy` array.
 
     Has no effect if the array is not a masked array.
diff --git a/cf/data/utils.py b/cf/data/utils.py
index ca1584ba85..fb077ebac9 100644
--- a/cf/data/utils.py
+++ b/cf/data/utils.py
@@ -2,13 +2,11 @@
 from functools import lru_cache, partial
 from itertools import product
 
-import numpy as np
-
 import dask.array as da
+import numpy as np
 
-from ..cfdatetime import dt2rt, st2rt, rt2dt
 from ..cfdatetime import dt as cf_dt
-
+from ..cfdatetime import dt2rt, rt2dt, st2rt
 from ..units import Units
 
 
@@ -379,7 +377,7 @@ def conform_units(value, units):
       unchanged;
 
     * if the value units are equivalent to *units* then a copy of
-      *value* with the same units as data* is returned;
+      *value* convert to *units* is returned;
 
     * if the value units are not equivalent to *units* then an
       exception is raised.

From 59bc506db80b10ac4025bb10cd0dc31691c42bde Mon Sep 17 00:00:00 2001
From: David Hassell <davidhassell@users.noreply.github.com>
Date: Mon, 4 Oct 2021 18:00:54 +0100
Subject: [PATCH 10/12] Typos

Co-authored-by: Sadie L. Bartholomew <sadie.bartholomew@ncas.ac.uk>
---
 cf/data/QUESTIONS.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cf/data/QUESTIONS.rst b/cf/data/QUESTIONS.rst
index 9fcd8e3f12..77b8ebf0ae 100644
--- a/cf/data/QUESTIONS.rst
+++ b/cf/data/QUESTIONS.rst
@@ -11,7 +11,7 @@ Q. When we run something that executes all of the lazy operations
    with a "persisted" version of the computed data? If we did this, we
    would want to have the ability to cache persisted chunks to disk,
    as they came into being on each thread (see, for instance,
-   `chest`). Do do this or not this could be controlled by a
+   `chest`). To do this or not do this could be controlled by a
    configuation setting.
 
 A. ?

From 142556d6bd4f303d3b535ca563852ded136b14b0 Mon Sep 17 00:00:00 2001
From: David Hassell <davidhassell@users.noreply.github.com>
Date: Mon, 4 Oct 2021 18:01:14 +0100
Subject: [PATCH 11/12] Typos

Co-authored-by: Sadie L. Bartholomew <sadie.bartholomew@ncas.ac.uk>
---
 cf/data/QUESTIONS.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cf/data/QUESTIONS.rst b/cf/data/QUESTIONS.rst
index 77b8ebf0ae..9014430d4e 100644
--- a/cf/data/QUESTIONS.rst
+++ b/cf/data/QUESTIONS.rst
@@ -1,7 +1,7 @@
 Questions and answers
 =====================
 
-A place to record randoms thought about the daskification of
+A place to record random thoughts about the daskification of
 `cf.Data`, possibly prior to starting an issue on GitHub.
 
 ----

From aecd89b5423d4c0b546c5f2da91321b05da7af95 Mon Sep 17 00:00:00 2001
From: David Hassell <davidhassell@users.noreply.github.com>
Date: Mon, 4 Oct 2021 18:24:02 +0100
Subject: [PATCH 12/12] Correct docstring

Co-authored-by: Sadie L. Bartholomew <sadie.bartholomew@ncas.ac.uk>
---
 cf/data/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cf/data/data.py b/cf/data/data.py
index 91f228481c..6839572a6a 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -1246,7 +1246,7 @@ def __orthogonal_indexing__(self):
     @property
     @daskified(1)
     def __keepdims_indexing__(self):
-        """Flag to indicate whether orthogonal indexing is supported.
+        """Flag to indicate whether dimensions indexed with integers are kept.
 
         If set to True (the default) then providing a single integer
         as a single-axis index does *not* reduce the number of array