diff --git a/.github/workflows/dask-migration-testing.yml b/.github/workflows/dask-migration-testing.yml new file mode 100644 index 0000000000..45528f82a4 --- /dev/null +++ b/.github/workflows/dask-migration-testing.yml @@ -0,0 +1,113 @@ +# A GitHub Action to run test_Data.py only for the 'lama-to-dask' branch +name: Test `cf.Data` during the replacement of LAMA with Dask + +on: + push: + branches: + - lama-to-dask + pull_request: + types: [opened, reopened, ready_for_review] + branches: + - lama-to-dask + +jobs: + test-suite-job-0: + + # Set-up the build matrix. We run on different distros and Python versions. + strategy: + matrix: + # Skip older ubuntu-16.04 & macos-10.15 to save usage resource + os: [ubuntu-latest, macos-latest] + python-version: [3.7, 3.8, 3.9] + + # Run on new and old(er) versions of the distros we support (Linux, Mac OS) + runs-on: ${{ matrix.os }} + + # The sequence of tasks that will be executed as part of this job: + steps: + + - name: Checkout cf-python + uses: actions/checkout@v2 + with: + path: main + + # Provide a notification message + - name: Notify about setup + run: echo Now setting up the environment for the cf-python test suite... + + - name: Checkout the current cfdm master to use as the dependency + uses: actions/checkout@v2 + with: + repository: NCAS-CMS/cfdm + path: cfdm + + # Prepare to run the test-suite on different versions of Python 3: + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + + # Setup conda, which is the simplest way to access all dependencies, + # especially as some are C-based so otherwise difficult to setup. 
+ - name: Setup Miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + miniconda-version: 'latest' + activate-environment: cf-latest + python-version: ${{ matrix.python-version }} + channels: ncas, conda-forge + + # Ensure shell is configured with conda activated: + - name: Check conda config + shell: bash -l {0} + run: | + conda info + conda list + conda config --show-sources + conda config --show + # Install cf-python dependencies, excluding cfdm, pre-testing + # We do so with conda which was setup in a previous step. + - name: Install dependencies + shell: bash -l {0} + run: | + conda install -c ncas -c conda-forge udunits2=2.2.25 + conda install -c conda-forge mpich esmpy + conda install scipy matplotlib dask + pip install pycodestyle + # Install cfdm from master branch, then the cf-python development version + # We do so with conda which was setup in a previous step. + - name: Install development cfdm and cf-python + shell: bash -l {0} + run: | + cd ${{ github.workspace }}/cfdm + pip install -e . + cd ${{ github.workspace }}/main + pip install -e . + # Make UMRead library + - name: Make UMRead + shell: bash -l {0} + run: | + cd ${{ github.workspace }}/main/cf/umread_lib/c-lib + make + # Install the coverage library + # We do so with conda which was setup in a previous step. + - name: Install coverage + shell: bash -l {0} + run: | + conda install coverage + # Provide another notification message + - name: Notify about starting testing + run: echo Setup complete. Now starting to run the cf-python test suite... + + # Finally run test_Data.py! + - name: Run the test_Data test module + shell: bash -l {0} + run: | + cd ${{ github.workspace }}/main/cf/test + python test_Data.py + + # End with a message indicating the suite has completed its run + - name: Notify about a completed run + run: | + echo The test_Data module has run and you can inspect the results. 
diff --git a/cf/__init__.py b/cf/__init__.py index 2235b28e2b..811368639b 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -81,17 +81,11 @@ """ __Conventions__ = "CF-1.8" -__date__ = "2021-06-10" -__version__ = "3.10.0" - -_requires = ( - "numpy", - "netCDF4", - "cftime", - "cfunits", - "cfdm", - "psutil", -) +__author__ = "David Hassell" +__date__ = "2021-??-??" +__version__ = "4.0.0" + +_requires = ("numpy", "netCDF4", "cftime", "cfunits", "cfdm", "psutil") x = ", ".join(_requires) _error0 = f"cf v{ __version__} requires the modules {x}. " @@ -193,8 +187,8 @@ ) # Check the version of cfdm -_minimum_vn = "1.8.9.0" -_maximum_vn = "1.8.10.0" +_minimum_vn = "1.9.0.1" +_maximum_vn = "1.9.1.0" _cfdm_version = LooseVersion(cfdm.__version__) if not LooseVersion(_minimum_vn) <= _cfdm_version < LooseVersion(_maximum_vn): raise RuntimeError( @@ -243,6 +237,7 @@ RaggedContiguousArray, RaggedIndexedArray, RaggedIndexedContiguousArray, + SubsampledArray, ) from .aggregate import aggregate diff --git a/cf/aggregate.py b/cf/aggregate.py index 66b048e916..9c66666906 100644 --- a/cf/aggregate.py +++ b/cf/aggregate.py @@ -2510,7 +2510,7 @@ def _get_hfl( if d._pmsize == 1: partition = d.partitions.matrix.item() if not partition.part: - key = getattr(partition.subarray, "file_pointer", None) + key = getattr(partition.subarray, "file_address", None) if key is not None: hash_value = hfl_cache.hash.get(key, None) create_hash = hash_value is None diff --git a/cf/bounds.py b/cf/bounds.py index 1668e0565e..55fd97da14 100644 --- a/cf/bounds.py +++ b/cf/bounds.py @@ -212,9 +212,7 @@ def contiguous(self, overlap=True, direction=None, period=None, verbose=1): else: if direction is None: b = data[(0,) * ndim].array - direction = b.item(0,) < b.item( - 1, - ) + direction = b.item(0) < b.item(1) if direction: return (data[1:, 0] <= data[:-1, 1]).all() diff --git a/cf/cfdatetime.py b/cf/cfdatetime.py index a1d77cde54..1c9513af3e 100644 --- a/cf/cfdatetime.py +++ b/cf/cfdatetime.py @@ -385,15 
+385,7 @@ def st2elements(date_string): if utc_offset: raise ValueError("Can't specify a time offset from UTC") - return ( - year, - month, - day, - hour, - minute, - second, - microsecond, - ) + return (year, month, day, hour, minute, second, microsecond) def rt2dt(array, units_in, units_out=None, dummy1=None): diff --git a/cf/constants.py b/cf/constants.py index 5a80f4dab3..07fc689ee1 100644 --- a/cf/constants.py +++ b/cf/constants.py @@ -537,32 +537,20 @@ "orog": { "surface_altitude": "altitude", "surface_height_above_geopotential_datum": "height_above_geopotential_datum", - }, + } }, "atmosphere_sleve_coordinate": { "ztop": { "altitude_at_top_of_atmosphere_model": "altitude", "height_above_geopotential_datum_at_top_of_atmosphere_model": "height_above_geopotential_datum", - }, - }, - "ocean_sigma_coordinate": { - "depth": _D1_depth_mapping, - }, - "ocean_s_coordinate": { - "depth": _D1_depth_mapping, - }, - "ocean_s_coordinate_g1": { - "depth": _D1_depth_mapping, - }, - "ocean_s_coordinate_g2": { - "depth": _D1_depth_mapping, - }, - "ocean_sigma_z_coordinate": { - "depth": _D1_depth_mapping, - }, - "ocean_double_sigma_coordinate": { - "depth": _D1_depth_mapping, - }, + } + }, + "ocean_sigma_coordinate": {"depth": _D1_depth_mapping}, + "ocean_s_coordinate": {"depth": _D1_depth_mapping}, + "ocean_s_coordinate_g1": {"depth": _D1_depth_mapping}, + "ocean_s_coordinate_g2": {"depth": _D1_depth_mapping}, + "ocean_sigma_z_coordinate": {"depth": _D1_depth_mapping}, + "ocean_double_sigma_coordinate": {"depth": _D1_depth_mapping}, } # -------------------------------------------------------------------- @@ -570,15 +558,8 @@ # Appendix D: Parametric Vertical Coordinates of the CF conventions. 
# -------------------------------------------------------------------- formula_terms_units = { - "atmosphere_ln_pressure_coordinate": { - "p0": "Pa", - "lev": "", - }, - "atmosphere_sigma_coordinate": { - "sigma": "", - "ptop": "Pa", - "ps": "Pa", - }, + "atmosphere_ln_pressure_coordinate": {"p0": "Pa", "lev": ""}, + "atmosphere_sigma_coordinate": {"sigma": "", "ptop": "Pa", "ps": "Pa"}, "atmosphere_hybrid_sigma_pressure_coordinate": { "p0": "Pa", "ps": "Pa", @@ -586,11 +567,7 @@ "a": "", "b": "", }, - "atmosphere_hybrid_height_coordinate": { - "a": "m", - "b": "", - "orog": "m", - }, + "atmosphere_hybrid_height_coordinate": {"a": "m", "b": "", "orog": "m"}, "atmosphere_sleve_coordinate": { "ztop": "m", "a": "", @@ -599,11 +576,7 @@ "zsurf1": "m", "zsurf2": "m", }, - "ocean_sigma_coordinate": { - "eta": "m", - "depth": "m", - "sigma": "", - }, + "ocean_sigma_coordinate": {"eta": "m", "depth": "m", "sigma": ""}, "ocean_s_coordinate": { "eta": "m", "depth": "m", diff --git a/cf/constructlist.py b/cf/constructlist.py index 3ddb57f8a1..a545c14e36 100644 --- a/cf/constructlist.py +++ b/cf/constructlist.py @@ -96,14 +96,7 @@ def __docstring_method_exclusions__(self): See `_docstring_method_exclusions` for details. """ - return ( - "append", - "extend", - "insert", - "pop", - "reverse", - "clear", - ) + return ("append", "extend", "insert", "pop", "reverse", "clear") # ---------------------------------------------------------------- # Overloaded list methods diff --git a/cf/data/QUESTIONS.rst b/cf/data/QUESTIONS.rst new file mode 100644 index 0000000000..9014430d4e --- /dev/null +++ b/cf/data/QUESTIONS.rst @@ -0,0 +1,25 @@ +Questions and answers +===================== + +A place to record random thoughts about the daskification of +`cf.Data`, possibly prior to starting an issue on GitHub. + +---- + +Q. 
When we run something that executes all of the lazy operations + (like `cf.Data.is_masked`), should/could we replace the dask array + with a "persisted" version of the computed data? If we did this, we + would want to have the ability to cache persisted chunks to disk, + as they came into being on each thread (see, for instance, + `chest`). To do this or not do this could be controlled by a + configuration setting. + +A. ? + +---- + +Q. + +A. ? + +---- diff --git a/cf/data/README.rst b/cf/data/README.rst new file mode 100644 index 0000000000..6e404b720e --- /dev/null +++ b/cf/data/README.rst @@ -0,0 +1,24 @@ +`cf.Data` developer notes +========================= + +Hardness of the mask +-------------------- + +Any `cf.Data` method that changes the dask array should consider +whether or not the mask hardness needs resetting before +returning. This will be necessary if there is the possibility that the +operation being applied to the dask array could lose the "memory" on +its chunks of whether or not the mask is hard. + +A common situation that causes a chunk to lose its memory of whether +or not the mask is hard is when a chunk could have contained an +unmasked `numpy` array prior to the operation, but the operation could +convert it to a masked `numpy` array. The new masked array will always +have the `numpy` default hardness (i.e. soft), which may be +incorrect. + +The mask hardness is most easily reset with the +`cf.Data._reset_mask_hardness` method. + +`cf.Data.__setitem__` and `cf.Data.where` are examples of methods that +need to reset the mask in this manner.
diff --git a/cf/data/__init__.py b/cf/data/__init__.py index f34ab86576..43c54813c6 100644 --- a/cf/data/__init__.py +++ b/cf/data/__init__.py @@ -1,16 +1,18 @@ from .cachedarray import CachedArray from .netcdfarray import NetCDFArray -from .filledarray import FilledArray from .umarray import UMArray +from .filledarray import FilledArray + from .gatheredarray import GatheredArray from .raggedcontiguousarray import RaggedContiguousArray from .raggedindexedarray import RaggedIndexedArray from .raggedindexedcontiguousarray import RaggedIndexedContiguousArray +from .subsampledarray import SubsampledArray -from .gatheredsubarray import GatheredSubarray -from .raggedcontiguoussubarray import RaggedContiguousSubarray -from .raggedindexedsubarray import RaggedIndexedSubarray -from .raggedindexedcontiguoussubarray import RaggedIndexedContiguousSubarray +# from .gatheredsubarray import GatheredSubarray +# from .raggedcontiguoussubarray import RaggedContiguousSubarray +# from .raggedindexedsubarray import RaggedIndexedSubarray +# from .raggedindexedcontiguoussubarray import RaggedIndexedContiguousSubarray from .data import Data diff --git a/cf/data/abstract/__init__.py b/cf/data/abstract/__init__.py index 6a5ada3777..ed8033a8e0 100644 --- a/cf/data/abstract/__init__.py +++ b/cf/data/abstract/__init__.py @@ -1,3 +1,2 @@ from .array import Array -from .compressedsubarray import CompressedSubarray from .filearray import FileArray diff --git a/cf/data/abstract/array.py b/cf/data/abstract/array.py index ea6a345c19..bf4d89114b 100644 --- a/cf/data/abstract/array.py +++ b/cf/data/abstract/array.py @@ -1,7 +1,9 @@ import cfdm +from ..mixin import ArrayMixin -class Array(cfdm.Array): + +class Array(ArrayMixin, cfdm.Array): """Abstract base class for a container of an underlying array. The form of the array is defined by the initialization parameters @@ -10,16 +12,3 @@ class Array(cfdm.Array): .. 
versionadded:: 3.0.0 """ - - def __repr__(self): - """Called by the `repr` built-in function. - - x.__repr__() <==> repr(x) - - .. versionadded:: 3.0.0 - - """ - return super().__repr__().replace("<", " x[indices] - - Returns a numpy array. - - """ - raise NotImplementedError() # pragma: no cover - - def __repr__(self): - """x.__repr__() <==> repr(x)""" - array = self.array - shape = str(array.shape) - shape = shape.replace(",)", ")") - - return "".format( - self.__class__.__name__, shape, str(array) - ) - - @property - def dtype(self): - return self.array.dtype - - @property - def file(self): - """The file on disk which contains the compressed array, or - `None` of the array is in memory. - - **Examples:** - - >>> self.file - '/home/foo/bar.nc' - - """ - return getattr(self.array, "file", None) - - def close(self): - """Close all referenced open files. - - :Returns: - - `None` - - **Examples:** - - >>> f.close() - - """ - if self.on_disk(): - self.array.close() - - def copy(self): - """Replace the abstract base class with a deep copy.""" - C = self.__class__ - new = C.__new__(C) - new.__dict__ = self.__dict__.copy() - return new - - def inspect(self): - """Inspect the object for debugging. - - .. seealso:: `cf.inspect` - - :Returns: - - `None` - - """ - print(cf_inspect(self)) - - def on_disk(self): - """True if and only if the compressed array is on disk as - opposed to in memory. - - **Examples:** - - >>> a.on_disk() - True - - """ - return not hasattr(self.array, "__array_interface__") - - def unique(self): - """True if there is only one permanent reference to the array - instance.""" - # Note, from the Python docs for sys.getrefcount: - # "The count returned is generally one higher than you might expect, - # because it includes the (temporary) reference as an argument to - # getrefcount", hence <= 2 to test for uniqueness rather than <= 1. 
- return getrefcount(self.array) <= 2 - - -# --- End: class diff --git a/cf/data/abstract/filearray.py b/cf/data/abstract/filearray.py index 433070498d..93b1dbd950 100644 --- a/cf/data/abstract/filearray.py +++ b/cf/data/abstract/filearray.py @@ -3,138 +3,38 @@ class FileArray(Array): - """A sub-array stored in a file. - - .. note:: Subclasses must define the following methods: - `!__getitem__`, `!__str__`, `!close` and `!open`. - - """ + """An array stored in a file.""" def __getitem__(self, indices): - """TODO.""" - pass - - def __str__(self): - """x.__str__() <==> str(x)""" - return "%s in %s" % (self.shape, self.file) - - # ---------------------------------------------------------------- - # Attributes - # ---------------------------------------------------------------- - @property - def dtype(self): - """Data-type of the data elements. + """Return a subspace of the array. - **Examples:** + x.__getitem__(indices) <==> x[indices] - >>> a.dtype - dtype('float64') - >>> print(type(a.dtype)) - + Returns a subspace of the array as an independent numpy array. """ - return self._get_component("dtype") + raise NotImplementedError( + f"Must implement {self.__class__.__name__}.__getitem__" + ) # pragma: no cover - @property - def ndim(self): - """Number of array dimensions. - - **Examples:** - - >>> a.shape - (73, 96) - >>> a.ndim - 2 - >>> a.size - 7008 - - >>> a.shape - (1, 1, 1) - >>> a.ndim - 3 - >>> a.size - 1 - - >>> a.shape - () - >>> a.ndim - 0 - >>> a.size - 1 - - """ - return self._get_component("ndim") + def __str__(self): + """x.__str__() <==> str(x)""" + return f"<{self.__class__.__name__}: {self.shape} in {self.file}" @property - def shape(self): - """Tuple of array dimension sizes. - - **Examples:** - - >>> a.shape - (73, 96) - >>> a.ndim - 2 - >>> a.size - 7008 - - >>> a.shape - (1, 1, 1) - >>> a.ndim - 3 - >>> a.size - 1 - - >>> a.shape - () - >>> a.ndim - 0 - >>> a.size - 1 + def _lock(self): + """TODODASK. 
- """ - return self._get_component("shape") + Concurrent reads are assumed to be supported. - @property - def size(self): - """Number of elements in the array. - - **Examples:** - - >>> a.shape - (73, 96) - >>> a.size - 7008 - >>> a.ndim - 2 - - >>> a.shape - (1, 1, 1) - >>> a.ndim - 3 - >>> a.size - 1 - - >>> a.shape - () - >>> a.ndim - 0 - >>> a.size - 1 + .. versionadded:: (cfdm) 1.9.TODO.0 """ - return self._get_component("size") + return False @property def filename(self): - """The name of the file containing the array. - - **Examples:** - - >>> a.filename() - 'file.nc' - - """ + """The name of the file containing the array.""" return self._get_component("filename") @property @@ -146,7 +46,7 @@ def array(self): `numpy.ndarray` An independent numpy array of the data. - **Examples:** + **Examples** >>> n = numpy.asanyarray(a) >>> isinstance(n, numpy.ndarray) @@ -155,6 +55,12 @@ def array(self): """ return self[...] + def close(self): + """Close the dataset containing the data.""" + raise NotImplementedError( + f"Must implement {self.__class__.__name__}.close" + ) # pragma: no cover + def inspect(self): """Inspect the object for debugging. @@ -170,7 +76,12 @@ def inspect(self): def get_filename(self): """Return the name of the file containing the array. - **Examples:** + :Returns: + + `str` + The file name. 
+ + **Examples** >>> a.get_filename() 'file.nc' @@ -178,13 +89,8 @@ def get_filename(self): """ return self._get_component("filename") - def close(self): - pass - def open(self): - pass - - -# --- End: class - -# Array.register(FileArray) + """Returns an open dataset containing the data array.""" + raise NotImplementedError( + f"Must implement {self.__class__.__name__}.open" + ) # pragma: no cover diff --git a/cf/data/collapse_functions.py b/cf/data/collapse_functions.py index 5fa40d90cc..79017e4ace 100644 --- a/cf/data/collapse_functions.py +++ b/cf/data/collapse_functions.py @@ -1301,7 +1301,16 @@ def var_ffinalise(out, sub_samples=None): # Currently: var = SUM(pV1(pv+pm**2) # avg = V1*m # - # http://en.wikipedia.org/wiki/Standard_deviation#Population-based_statistics + # https://en.wikipedia.org/wiki/Pooled_variance#Population-based_statistics + # + # For the general case of M non-overlapping data sets, X_{1} + # through X_{M}, and the aggregate data set X=\bigcup_{i}X_{i} + # we have the unweighted mean and variance is: + # + # \mu_{X}={\frac{1}{\sum_{i}{N_{X_{i}}}}}\left(\sum_{i}{N_{X_{i}}\mu_{X_{i}}}\right) + # + # var_{X}={{\frac{1}{\sum_{i}{N_{X_{i}}-ddof}}}\left(\sum_{i}{\left[(N_{X_{i}}-1)\sigma_{X_{i}}^{2}+N_{X_{i}}\mu_{X_{i}}^{2}\right]}-\left[\sum_{i}{N_{X_{i}}}\right]\mu_{X}^{2}\right)} + # # ---------------------------------------------------------------- avg /= V1 avg *= avg diff --git a/cf/data/creation.py b/cf/data/creation.py new file mode 100644 index 0000000000..44c4b63ca7 --- /dev/null +++ b/cf/data/creation.py @@ -0,0 +1,372 @@ +"""Functions used during the creation of `Data` objects.""" +from functools import lru_cache, partial + +import dask.array as da +import numpy as np +from dask import config +from dask.array.core import getter, normalize_chunks +from dask.base import tokenize +from dask.utils import SerializableLock + + +def convert_to_builtin_type(x): + """Convert a non-JSON-encodable object to a JSON-encodable built-in + type. 
+ + Possible conversions are: + + ================ ======= ================================ + Input Output `numpy` data-types covered + ================ ======= ================================ + `numpy.bool_` `bool` bool + `numpy.integer` `int` int, int8, int16, int32, int64, + uint8, uint16, uint32, uint64 + `numpy.floating` `float` float, float16, float32, float64 + ================ ======= ================================ + + .. versionadded:: 4.0.0 + + :Parameters: + + x: + TODO + + :Returns: + + TODO + + **Examples** + + >>> type(convert_to_builtin_type(numpy.bool_(True))) + bool + >>> type(convert_to_builtin_type(numpy.array([1.0])[0])) + float + >>> type(convert_to_builtin_type(numpy.array([2])[0])) + int + + """ + if isinstance(x, np.bool_): + return bool(x) + + if isinstance(x, np.integer): + return int(x) + + if isinstance(x, np.floating): + return float(x) + + raise TypeError(f"{type(x)!r} object is not JSON serializable: {x!r}") + + +def to_dask(array, chunks, dask_from_array_options): + """TODODASK. + + .. versionadded:: TODODASK + + :Parameters: + + array: array_like + + chunks: `int`, `tuple`, `dict` or `str`, optional + Specify the chunking of the returned dask array. + + Any value accepted by the *chunks* parameter of the + `dask.array.from_array` function is allowed. + + dask_from_array_options: `dict` + Keyword arguments to be passed to `dask.array.from_array`. + + :Returns: + + `dask.array.Array` + + **Examples** + + >>> to_dask([1, 2, 3]) + dask.array + >>> to_dask([1, 2, 3], chunks=2) + dask.array + >>> to_dask([1, 2, 3], chunks=2, {'asarray': True}) + dask.array + + """ + if "chunks" in dask_from_array_options: + raise TypeError( + "Can't define 'chunks' in the 'dask_from_array_options' " + "dictionary. Use the 'chunks' parameter instead."
+ ) + + kwargs = dask_from_array_options.copy() + kwargs.setdefault("lock", getattr(array, "_lock", True)) + + return da.from_array(array, chunks=chunks, **kwargs) + + +def compressed_to_dask(array, chunks): + """TODODASK Create and insert a partition matrix for a compressed + array. + + .. versionadded:: TODODASK + + :Parameters: + + array: subclass of `Array` + The compressed array. + + chunks: `int`, `tuple`, `dict` or `str`, optional + Specify the chunking of the returned dask array. + + Any value accepted by the *chunks* parameter of the + `dask.array.from_array` function is allowed. + + The chunk sizes implied by *chunks* for a dimension that + has been compressed are ignored and replaced with values + that are implied by the decompression algorithm, so their + specification is arbitrary. + + :Returns: + + `dask.array.Array` + + """ + # Initialise a dask graph for the uncompressed array + name = (array.__class__.__name__ + "-" + tokenize(array),) + dsk = {} + full_slice = Ellipsis + + # A context manager that is used to ensure that all data accessed + # from within a `Subarray` instance is done so synchronously, + # thereby avoiding any "compute within a compute" thread + # proliferation. + context = partial(config.set, scheduler="synchronous") + + compressed_dimensions = array.compressed_dimensions() + conformed_data = array.conformed_data() + compressed_data = conformed_data["data"] + + # ---------------------------------------------------------------- + # Set the chunk sizes for the dask array. + # + # Note: The chunk sizes implied by the input 'chunks' for a + # dimension that has been compressed are ignored in favour + # of those created by 'array.subarray_shapes'. For + # subsampled arrays, such chunk sizes will be incorrect and + # must be corrected later.
+ # + # ---------------------------------------------------------------- + uncompressed_dtype = array.dtype + chunks = normalize_chunks( + array.subarray_shapes(chunks), + shape=array.shape, + dtype=uncompressed_dtype, + ) + + # Get the (cfdm) subarray class + Subarray = array.get_Subarray() + + compression_type = array.get_compression_type() + if compression_type.startswith("ragged"): + # ------------------------------------------------------------ + # Ragged + # ------------------------------------------------------------ + for u_indices, u_shape, c_indices, chunk_location in zip( + *array.subarrays(shapes=chunks) + ): + subarray = Subarray( + data=compressed_data, + indices=c_indices, + shape=u_shape, + compressed_dimensions=compressed_dimensions, + context_manager=context, + ) + + dsk[name + chunk_location] = ( + getter, + subarray, + full_slice, + False, + False, + ) + + elif compression_type == "gathered": + # ------------------------------------------------------------ + # Gathered + # ------------------------------------------------------------ + uncompressed_indices = conformed_data["uncompressed_indices"] + + for u_indices, u_shape, c_indices, chunk_location in zip( + *array.subarrays(shapes=chunks) + ): + subarray = Subarray( + data=compressed_data, + indices=c_indices, + shape=u_shape, + compressed_dimensions=compressed_dimensions, + uncompressed_indices=uncompressed_indices, + context_manager=context, + ) + + dsk[name + chunk_location] = ( + getter, + subarray, + full_slice, + False, + False, + ) + + elif compression_type == "subsampled": + # ------------------------------------------------------------ + # Subsampled + # + # Note: The chunks created above are incorrect for the + # compressed dimensions, since these chunk sizes are a + # function of the tie point indices which haven't yet + # been accessed. Therefore, the chunks for the + # compressed dimensions must be redefined here.
+ # + # ------------------------------------------------------------ + + # Re-initialise the chunks + u_dims = list(compressed_dimensions) + chunks = [[] if i in u_dims else c for i, c in enumerate(chunks)] + previous_chunk_location = [-1] * len(chunks) + + parameters = conformed_data["parameters"] + dependent_tie_points = conformed_data["dependent_tie_points"] + + for ( + u_indices, + u_shape, + c_indices, + subarea_indices, + first, + chunk_location, + ) in zip(*array.subarrays(shapes=chunks)): + subarray = Subarray( + data=compressed_data, + indices=c_indices, + shape=u_shape, + compressed_dimensions=compressed_dimensions, + first=first, + subarea_indices=subarea_indices, + parameters=parameters, + dependent_tie_points=dependent_tie_points, + context_manager=context, + ) + + dsk[name + chunk_location] = ( + getter, + subarray, + full_slice, + False, + False, + ) + + # Add correct chunk sizes + for d in u_dims[:]: + previous = previous_chunk_location[d] + new = chunk_location[d] + if new > previous: + chunks[d].append(u_shape[d]) + previous_chunk_location[d] = new + elif new < previous: + # No more chunk sizes required for this dimension + u_dims.remove(d) + + chunks = [tuple(c) for c in chunks] + + else: + raise ValueError( + f"Can't initialise 'Data' from compressed {array!r} with " + f"unknown compression type {compression_type!r}" + ) + + # Return the dask array + return da.Array(dsk, name[0], chunks=chunks, dtype=uncompressed_dtype) + + +@lru_cache(maxsize=32) +def generate_axis_identifiers(n): + """Return new, unique axis identifiers for a given number of axes. + + The names are arbitrary and have no semantic meaning. + + .. versionadded:: TODODASK + + :Parameters: + + n: `int` + Generate this number of axis identifiers. + + :Returns: + + `list` + The new axis identifiers.
+ + **Examples** + + >>> generate_axis_identifiers(0) + [] + >>> generate_axis_identifiers(1) + ['dim0'] + >>> generate_axis_identifiers(3) + ['dim0', 'dim1', 'dim2'] + + """ + return [f"dim{i}" for i in range(n)] + + +def threads(): + """Return True if the threaded scheduler executes computations. + + See https://docs.dask.org/en/latest/scheduling.html for details. + + .. versionadded:: TODODASK + + """ + return config.get("scheduler", default=None) in (None, "threads") + + +def processes(): + """Return True if the multiprocessing scheduler executes + computations. + + See https://docs.dask.org/en/latest/scheduling.html for details. + + .. versionadded:: TODODASK + + """ + return config.get("scheduler", default=None) == "processes" + + +def synchronous(): + """Return True if the single-threaded synchronous scheduler executes + computations in the local thread with no parallelism at + all. + + See https://docs.dask.org/en/latest/scheduling.html for details. + + .. versionadded:: TODODASK + + """ + return config.get("scheduler", default=None) == "synchronous" + + +def get_lock(): + """TODODASK. + + See https://docs.dask.org/en/latest/scheduling.html for details. + + .. versionadded:: TODODASK + + """ + if threads(): + return SerializableLock() + + if synchronous(): + return False + + if processes(): + raise ValueError("TODODASK - not yet sorted out processes lock") + # Do we even need one? Can't we have lock=False, here? + + raise ValueError("TODODASK - what now? raise exception? cluster?") diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py new file mode 100644 index 0000000000..7a6c4dcc07 --- /dev/null +++ b/cf/data/dask_utils.py @@ -0,0 +1,155 @@ +"""Functions intended to be passed to dask. + +These will typically be functions that operate on dask chunks. For +instance, as would be passed to `dask.array.map_blocks`. + +""" + +import numpy as np + + +def cf_harden_mask(a): + """Harden the mask of a masked `numpy` array.
+ + Has no effect if the array is not a masked array. + + .. versionadded:: TODODASK + + .. seealso:: `cf.Data.harden_mask` + + :Parameters: + + a: `numpy.ndarray` + The array to have a hardened mask. + + :Returns: + + `numpy.ndarray` + The array with hardened mask. + + """ + if np.ma.isMA(a): + a.harden_mask() + + return a + + +def cf_soften_mask(a): + """Soften the mask of a masked `numpy` array. + + Has no effect if the array is not a masked array. + + .. versionadded:: TODODASK + + .. seealso:: `cf.Data.soften_mask` + + :Parameters: + + a: `numpy.ndarray` + The array to have a softened mask. + + :Returns: + + `numpy.ndarray` + The array with softened mask. + + """ + if np.ma.isMA(a): + a.soften_mask() + + return a + + +def cf_where(array, condition, x, y, hardmask): + """Set elements of *array* from *x* or *y* depending on *condition*. + + The input *array* is not changed in-place. + + See `where` for details on the expected functionality. + + .. note:: This function correctly sets the mask hardness of the + output array. + + .. versionadded:: TODODASK + + .. seealso:: `cf.Data.where` + + :Parameters: + + array: numpy.ndarray + The array to be assigned to. + + condition: numpy.ndarray + Where False or masked, assign from *y*, otherwise assign + from *x*. + + x: numpy.ndarray or `None` + *x* and *y* must not both be `None`. + + y: numpy.ndarray or `None` + *x* and *y* must not both be `None`. + + hardmask: `bool` + Set the mask hardness for a returned masked array. If True + then a returned masked array will have a hardened mask, and + the mask of the input *array* (if there is one) will be + applied to the returned array, in addition to any masked + elements arising from assignments from *x* or *y*. + + :Returns: + + `numpy.ndarray` + A copy of the input *array* with elements from *y* where + *condition* is False or masked, and elements from *x* + elsewhere. 
+ + """ + mask = None + + if np.ma.isMA(array): + # Do a masked where + where = np.ma.where + if hardmask: + mask = array.mask + elif np.ma.isMA(x) or np.ma.isMA(y): + # Do a masked where + where = np.ma.where + else: + # Do a non-masked where + where = np.where + hardmask = False + + condition_is_masked = np.ma.isMA(condition) + if condition_is_masked: + condition = condition.astype(bool) + + if x is not None: + # Assign values from x + if condition_is_masked: + # Replace masked elements of condition with False, so that + # masked locations are assigned from array + c = condition.filled(False) + else: + c = condition + + array = where(c, x, array) + + if y is not None: + # Assign values from y + if condition_is_masked: + # Replace masked elements of condition with True, so that + # masked locations are assigned from array + c = condition.filled(True) + else: + c = condition + + array = where(c, array, y) + + if hardmask: + if mask is not None and mask.any(): + # Apply the mask from the input array to the result + array.mask |= mask + + array.harden_mask() + + return array diff --git a/cf/data/data.py b/cf/data/data.py index 971ef815a6..04f1672dc1 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1,12 +1,12 @@ -import itertools import logging +import math import operator -from functools import reduce as functools_reduce +from functools import partial, reduce, wraps +from itertools import product from json import dumps as json_dumps from json import loads as json_loads -from math import ceil as math_ceil -from operator import itemgetter -from operator import mul as operator_mul +from numbers import Integral +from operator import mul try: from scipy.ndimage.filters import convolve1d as scipy_convolve1d @@ -15,86 +15,15 @@ import cfdm import cftime -import numpy - -# from numpy import arctan2 as numpy_arctan2 TODO AT2 -from numpy import arange as numpy_arange -from numpy import arccos as numpy_arccos -from numpy import arccosh as numpy_arccosh -from numpy import 
arcsin as numpy_arcsin -from numpy import arcsinh as numpy_arcsinh -from numpy import arctan as numpy_arctan -from numpy import arctanh as numpy_arctanh -from numpy import array as numpy_array -from numpy import asanyarray as numpy_asanyarray -from numpy import bool_ as numpy_bool_ -from numpy import ceil as numpy_ceil -from numpy import cos as numpy_cos -from numpy import cosh as numpy_cosh -from numpy import cumsum as numpy_cumsum -from numpy import diff as numpy_diff -from numpy import digitize as numpy_digitize -from numpy import dtype as numpy_dtype -from numpy import empty as numpy_empty -from numpy import errstate as numpy_errstate -from numpy import exp as numpy_exp -from numpy import expand_dims as numpy_expand_dims -from numpy import finfo as numpy_finfo -from numpy import floating as numpy_floating -from numpy import floor as numpy_floor -from numpy import integer as numpy_integer -from numpy import isnan as numpy_isnan -from numpy import linspace as numpy_linspace -from numpy import log as numpy_log -from numpy import log2 as numpy_log2 -from numpy import log10 as numpy_log10 -from numpy import nan as numpy_nan -from numpy import nanpercentile as numpy_nanpercentile -from numpy import ndarray as numpy_ndarray -from numpy import ndenumerate as numpy_ndenumerate -from numpy import ndim as numpy_ndim -from numpy import ndindex as numpy_ndindex -from numpy import newaxis as numpy_newaxis -from numpy import ones as numpy_ones -from numpy import percentile as numpy_percentile -from numpy import prod as numpy_prod -from numpy import ravel_multi_index as numpy_ravel_multi_index -from numpy import reshape as numpy_reshape -from numpy import result_type as numpy_result_type -from numpy import rint as numpy_rint -from numpy import round as numpy_round -from numpy import seterr as numpy_seterr -from numpy import shape as numpy_shape -from numpy import sin as numpy_sin -from numpy import sinh as numpy_sinh -from numpy import size as numpy_size -from numpy import tan 
as numpy_tan -from numpy import tanh as numpy_tanh -from numpy import tile as numpy_tile -from numpy import trunc as numpy_trunc -from numpy import unique as numpy_unique -from numpy import unravel_index as numpy_unravel_index -from numpy import vectorize as numpy_vectorize -from numpy import where as numpy_where -from numpy import zeros as numpy_zeros -from numpy.ma import MaskedArray as numpy_ma_MaskedArray -from numpy.ma import array as numpy_ma_array -from numpy.ma import count as numpy_ma_count -from numpy.ma import empty as numpy_ma_empty -from numpy.ma import filled as numpy_ma_filled -from numpy.ma import is_masked as numpy_ma_is_masked -from numpy.ma import isMA as numpy_ma_isMA -from numpy.ma import masked as numpy_ma_masked -from numpy.ma import masked_all as numpy_ma_masked_all -from numpy.ma import masked_invalid as numpy_ma_masked_invalid -from numpy.ma import masked_where as numpy_ma_masked_where -from numpy.ma import nomask as numpy_ma_nomask -from numpy.ma import where as numpy_ma_where +import dask.array as da +import numpy as np + +# from dask.array.core import slices_from_chunks +from dask.base import is_dask_collection from numpy.testing import suppress_warnings as numpy_testing_suppress_warnings -from .. 
import mpi_on # TODODASK : remove when move to dask is complete from ..cfdatetime import dt as cf_dt -from ..cfdatetime import dt2rt, rt2dt, st2rt +from ..cfdatetime import dt2rt, rt2dt # , st2rt from ..constants import masked as cf_masked from ..decorators import ( _deprecated_kwarg_check, @@ -103,40 +32,23 @@ _inplace_enabled_define_and_cleanup, _manage_log_level_via_verbosity, ) -from ..functions import ( - _DEPRECATION_ERROR_ATTRIBUTE, - _DEPRECATION_ERROR_METHOD, - _numpy_allclose, - _numpy_isclose, - _section, - abspath, -) +from ..functions import _numpy_allclose, _numpy_isclose, _section, abspath from ..functions import atol as cf_atol from ..functions import broadcast_array from ..functions import chunksize as cf_chunksize -from ..functions import collapse_parallel_mode, default_netCDF_fillvals +from ..functions import default_netCDF_fillvals from ..functions import fm_threshold as cf_fm_threshold from ..functions import free_memory, hash_array from ..functions import inspect as cf_inspect -from ..functions import parse_indices, pathjoin +from ..functions import log_level, parse_indices, pathjoin from ..functions import rtol as cf_rtol from ..mixin_container import Container from ..units import Units -from . import ( - GatheredSubarray, - NetCDFArray, - RaggedContiguousSubarray, - RaggedIndexedContiguousSubarray, - RaggedIndexedSubarray, - UMArray, -) -from .collapse_functions import ( +from . 
import NetCDFArray, UMArray +from .collapse_functions import ( # max_f,; max_ffinalise,; max_fpartial, max_abs_f, max_abs_ffinalise, max_abs_fpartial, - max_f, - max_ffinalise, - max_fpartial, mean_abs_f, mean_abs_ffinalise, mean_abs_fpartial, @@ -180,75 +92,77 @@ var_ffinalise, var_fpartial, ) +from .creation import ( + compressed_to_dask, + convert_to_builtin_type, + generate_axis_identifiers, + to_dask, +) +from .dask_utils import cf_harden_mask, cf_soften_mask, cf_where from .filledarray import FilledArray -from .partition import Partition -from .partitionmatrix import PartitionMatrix - -if mpi_on: # TODODASK : remove when move to dask is complete - from mpi4py.MPI import SUM as mpi_sum - - from .. import mpi_comm, mpi_rank, mpi_size - +from .mixin import DataClassDeprecationsMixin +from .utils import ( # is_small,; is_very_small, + conform_units, + convert_to_datetime, + convert_to_reftime, + dask_compatible, + first_non_missing_value, + new_axis_identifier, + scalar_masked_array, +) logger = logging.getLogger(__name__) +daskified_log_level = 0 -# -------------------------------------------------------------------- -# Constants -# -------------------------------------------------------------------- -_year_length = 365.242198781 -_month_length = _year_length / 12 - - -def _convert_to_builtin_type(x): - """Convert a non-JSON-encodable object to a JSON-encodable built-in - type. - - Possible conversions are: - ================ ======= ================================ - Input Output `numpy` data-types covered - ================ ======= ================================ - `numpy.bool_` `bool` bool - `numpy.integer` `int` int, int8, int16, int32, int64, - uint8, uint16, uint32, uint64 - `numpy.floating` `float` float, float16, float32, float64 - ================ ======= ================================ +def daskified(apply_temp_log_level=None): + def decorator(method): + """Temporary decorator to mark and log methods migrated to Dask. 
- :Parameters: + A log level argument will set the log level throughout the call of + the method to that level and then reset it back to the previous + global level. A message will also be emitted to indicate whenever + the method is called, unless no argument is given [daskified()] + in which case the decorator does nothing except mark methods + which are considered to be daskified, a main purpose for this + decorator. - x: - `numpy.bool_` or `numpy.integer` or `numpy.floating` - The object of some numpy primitive data type. + Note: for properties the decorator must be placed underneath the + property decorator so it is called before and not after it. - :Returns: + """ - `bool` or `int` or `float` - The object converted to a JSON-encodable type. + @wraps(method) + def wrapper(*args, **kwargs): + if apply_temp_log_level is None: # distingush from 0 + return method(*args, **kwargs) - **Examples:** + original_global_log_level = log_level() + # Switch log level for the duration of the method call, with an + # initial message to indicate a run first guaranteed to show + log_level(apply_temp_log_level) + # Not actually a warning, but setting as warning ensures it shows + # (unless logging is disabled, but ignore that complication for + # this temporary and informal decorator!) + logger.warning(f"%%%%% Running daskified {method.__name__} %%%%%") - >>> type(_convert_to_builtin_type(numpy.bool_(True))) - bool - >>> type(_convert_to_builtin_type(numpy.array([1.0])[0])) - double - >>> type(_convert_to_builtin_type(numpy.array([2])[0])) - int + out = method(*args, **kwargs) - """ - if isinstance(x, numpy_bool_): - return bool(x) + # ... 
then return the log level to the global level afterwards + log_level(original_global_log_level) + return out - if isinstance(x, numpy_integer): - return int(x) + return wrapper - if isinstance(x, numpy_floating): - return float(x) + return decorator - raise TypeError( - "{0!r} object is not JSON serializable: {1!r}".format(type(x), x) - ) +# -------------------------------------------------------------------- +# Constants +# -------------------------------------------------------------------- +_year_length = 365.242198781 +_month_length = _year_length / 12 # -------------------------------------------------------------------- # _seterr = How floating-point errors in the results of arithmetic @@ -281,56 +195,21 @@ def _convert_to_builtin_type(x): # -------------------------------------------------------------------- _mask_fpe = [False] -_xxx = numpy_empty((), dtype=object) - _empty_set = set() _units_None = Units() _units_1 = Units("1") _units_radians = Units("radians") -_dtype_object = numpy_dtype(object) -_dtype_float = numpy_dtype(float) -_dtype_bool = numpy_dtype(bool) - -_cached_axes = {0: []} - - -def _initialise_axes(ndim): - """Initialise dimension identifiers of N-d data. - - :Parameters: - - ndim: `int` - The number of dimensions in the data. - - :Returns: - - `list` - The dimension identifiers, one of each dimension in the - array. If the data is scalar thn the list will be empty. 
- - **Examples:** - - >>> _initialise_axes(0) - [] - >>> _initialise_axes(1) - ['dim1'] - >>> _initialise_axes(3) - ['dim1', 'dim2', 'dim3'] - >>> _initialise_axes(3) is _initialise_axes(3) - True - - """ - axes = _cached_axes.get(ndim, None) - if axes is None: - axes = ["dim%d" % i for i in range(ndim)] - _cached_axes[ndim] = axes +_dtype_float32 = np.dtype("float32") +_dtype_float = np.dtype(float) +_dtype_bool = np.dtype(bool) - return axes +_DEFAULT_CHUNKS = "auto" +_DEFAULT_HARDMASK = True -class Data(Container, cfdm.Data): +class Data(Container, cfdm.Data, DataClassDeprecationsMixin): """An N-dimensional data array with units and masked values. * Contains an N-dimensional, indexable and broadcastable array with @@ -392,14 +271,6 @@ class Data(Container, cfdm.Data): **Cyclic axes** - **Miscellaneous** - - A `Data` object is picklable. - - A `Data` object is hashable, but note that, since it is mutable, its - hash value is only valid whilst the data array is not changed in - place. - """ def __init__( @@ -408,8 +279,8 @@ def __init__( units=None, calendar=None, fill_value=None, - hardmask=True, - chunk=True, + hardmask=_DEFAULT_HARDMASK, + chunks=_DEFAULT_CHUNKS, loadd=None, loads=None, dt=False, @@ -417,6 +288,8 @@ def __init__( copy=True, dtype=None, mask=None, + to_memory=False, + dask_from_array_options={}, _use_array=True, ): """**Initialization** @@ -425,8 +298,7 @@ def __init__( array: optional The array of values. May be any scalar or array-like - object, including another `Data` instance. Ignored if the - *source* parameter is set. + object, including another `Data` instance. *Parameter example:* ``array=[34.6]`` @@ -439,8 +311,7 @@ def __init__( units: `str` or `Units`, optional The physical units of the data. if a `Units` object is - provided then this an also set the calendar. Ignored if - the *source* parameter is set. + provided then this an also set the calendar. 
The units (without the calendar) may also be set after initialisation with the `set_units` method. @@ -452,8 +323,7 @@ def __init__( ``units='days since 2018-12-01'`` calendar: `str`, optional - The calendar for reference time units. Ignored if the - *source* parameter is set. + The calendar for reference time units. The calendar may also be set after initialisation with the `set_calendar` method. @@ -465,8 +335,7 @@ def __init__( The fill value of the data. By default, or if set to `None`, the `numpy` fill value appropriate to the array's data-type will be used (see - `numpy.ma.default_fill_value`). Ignored if the *source* - parameter is set. + `numpy.ma.default_fill_value`). The fill value may also be set after initialisation with the `set_fill_value` method. @@ -476,7 +345,8 @@ def __init__( dtype: data-type, optional The desired data-type for the data. By default the - data-type will be inferred form the *array* parameter. + data-type will be inferred form the *array* + parameter. The data-type may also be set after initialisation with the `dtype` attribute. @@ -494,11 +364,12 @@ def __init__( mask: optional Apply this mask to the data given by the *array* - parameter. By default, or if *mask* is `None`, no mask is - applied. May be any scalar or array-like object (such as a - `list`, `numpy` array or `Data` instance) that is - broadcastable to the shape of *array*. Masking will be - carried out where the mask elements evaluate to `True`. + parameter. By default, or if *mask* is `None`, no mask + is applied. May be any scalar or array-like object + (such as a `list`, `numpy` array or `Data` instance) + that is broadcastable to the shape of *array*. Masking + will be carried out where the mask elements evaluate + to `True`. This mask will applied in addition to any mask already defined by the *array* parameter. @@ -506,8 +377,10 @@ def __init__( .. 
versionadded:: 3.0.5 source: optional - Initialize the array, units, calendar and fill value from - those of *source*. + Initialize the data values and metadata (such as + units, mask hardness, etc.) from the data of + *source*. All other arguments, with the exception of + *copy*, are ignored. hardmask: `bool`, optional If False then the mask is soft. By default the mask is @@ -532,504 +405,270 @@ def __init__( If False then do not deep copy input parameters prior to initialization. By default arguments are deep copied. - chunk: `bool`, optional - If False then the data array will be stored in a single - partition. By default the data array will be partitioned - if it is larger than the chunk size, as returned by the - `cf.chunksize` function. - - **Examples:** - - >>> d = cf.Data(5) - >>> d = cf.Data([1,2,3], units='K') - >>> import numpy - >>> d = cf.Data(numpy.arange(10).reshape(2,5), - ... units=Units('m/s'), fill_value=-999) - >>> d = cf.Data(tuple('fly')) - - """ - data = array - - super().__init__(source=source, fill_value=fill_value) - - if source is not None: - partitions = self._custom.get("partitions") - if partitions is not None: - self.partitions = partitions.copy() - - auxiliary_mask = self._custom.get("_auxiliary_mask") - if auxiliary_mask is not None: - self._auxiliary_mask = [mask.copy() for mask in auxiliary_mask] - - return - - if not (loadd or loads): - units = Units(units, calendar=calendar) - self._Units = units - - empty_list = [] - - # The _flip attribute is an unordered subset of the data - # array's axis names. It is a subset of the axes given by the - # _axes attribute. It is used to determine whether or not to - # reverse an axis in each partition's sub-array during the - # creation of the partition's data array. DO NOT CHANGE IN - # PLACE. 
- self._flip(empty_list) - - # The _all_axes attribute must be None or a tuple - self._all_axes = None + chunks: `int`, `tuple`, `dict` or `str`, optional + Specify the chunking of the underlying dask array. - self.hardmask = hardmask - - # The _HDF_chunks attribute is.... Is either None or a - # dictionary. DO NOT CHANGE IN PLACE. - self._HDF_chunks = None + Any value accepted by the *chunks* parameter of the + `dask.array.from_array` function is allowed. - # ------------------------------------------------------------ - # Attribute: _auxiliary_mask - # - # Must be None or a (possibly empty) list of Data objects. - # ------------------------------------------------------------ - self._auxiliary_mask = None + By default, ``"auto"`` is used to specify the array + chunking, which uses a chunk size in bytes defined by + the configuration value + ``dask.config("array.chunk-size")``, prefering + square-like chunk shapes. - if loadd is not None: - self.loadd(loadd, chunk=chunk) - return + *Parameter example:* + A blocksize like ``1000``. - if loads is not None: - self.loads(loads, chunk=chunk) - return + *Parameter example:* + A blockshape like ``(1000, 1000)``. - # The _cyclic attribute contains the axes of the data array - # which are cyclic (and therefore allow cyclic slicing). It is - # a subset of the axes given by the _axes attribute. DO NOT - # CHANGE IN PLACE. - self._cyclic = _empty_set + *Parameter example:* + Explicit sizes of all blocks along all dimensions + like ``((1000, 1000, 500), (400, 400))``. - data = array + *Parameter example:* + A size in bytes, like ``"100 MiB"`` which will + choose a uniform block-like shape, prefering + square-like chunk shapes. - if data is None: - if dtype is not None: - dtype = numpy_dtype(dtype) + *Parameter example:* + The word ``"auto"`` which acts like a size in bytes, + but uses a chunk size defined by the configuration + value ``dask.config("array.chunk-size")``. 
- self._dtype = dtype - return + *Parameter example:* + ``-1`` or `None` as a blocksize indicate the size of + the corresponding dimension. - # if not isinstance(data, Array): - if not self._is_abstract_Array_subclass(data): - check_free_memory = True + *Parameter example:* + Blocksizes of some or all dimensions mapped to + dimension positions, like ``{1: 200}``, or ``{0: + -1, 1: (400, 400)}``. - if isinstance(data, self.__class__): - # self.loadd(data.dumpd(), chunk=chunk) - self.__dict__ = data.copy().__dict__ - if chunk: - self.chunk() + .. versionadded:: TODODASK - if mask is not None: - self.where(mask, cf_masked, inplace=True) + to_memory: `bool optional + If True then attempt to read the data that defines + *array* from disk prior to assigning it to the + underlying dask array. This is done by replacing + *array* with the output of its `!to_memory`, if it has + one. Reading data from disk during initialisation will + slow down the initialisation process, but can + considerably improve downstream performance by + avoiding the need for independent reads on every dask + chunk. By default, data of *array* that is on disk + remains there until the required. - return + .. versionadded:: TODODASK - if not isinstance(data, numpy_ndarray): - data = numpy_asanyarray(data) + dask_from_array_options: `dict`, optional + Keyword arguments to pass to `dask.array.from_array`. 
- if ( - data.dtype.kind == "O" - and not dt - and hasattr(data.item((0,) * data.ndim), "timetuple") - ): - # We've been given one or more date-time objects - dt = True - else: - check_free_memory = False - - _dtype = data.dtype - - if dt or units.isreftime: - # TODO raise exception if compressed - kind = _dtype.kind - if kind in "US": - # Convert date-time strings to reference time floats - if not units: - YMD = str(data.item((0,) * data.ndim)).partition("T")[0] - units = Units("days since " + YMD, units._calendar) - self._Units = units - - data = st2rt(data, units, units) - _dtype = data.dtype - elif kind == "O": - # Convert date-time objects to reference time floats - x = data.item(0) - x_since = "days since " + "-".join( - map(str, (x.year, x.month, x.day)) - ) - x_calendar = getattr(x, "calendar", "gregorian") + .. versionadded:: TODODASK - d_calendar = getattr(self.Units, "calendar", None) - d_units = getattr(self.Units, "units", None) + chunk: deprecated at version TODODASK + Use the *chunks* parameter instead. - if x_calendar != "": - if d_calendar is not None: - if not self.Units.equivalent( - Units(x_since, x_calendar) - ): - raise ValueError( - "Incompatible units: {!r}, {!r}".format( - self.Units, Units(x_since, x_calendar) - ) - ) - else: - d_calendar = x_calendar - # --- End: if + **Examples:** - if not units: - # Set the units to something that is (hopefully) - # close to all of the datetimes, in an attempt to - # reduce errors arising from the conversion to - # reference times - units = Units(x_since, calendar=d_calendar) - else: - units = Units(d_units, calendar=d_calendar) + >>> d = cf.Data(5) + >>> d = cf.Data([1,2,3], units='K') + >>> import numpy + >>> d = cf.Data(numpy.arange(10).reshape(2,5), + ... 
units=Units('m/s'), fill_value=-999) + >>> d = cf.Data('fly') + >>> d = cf.Data(tuple('fly')) - self._Units = units + """ + if source is None and isinstance(array, self.__class__): + source = array - # Check that all date-time objects have correct and - # equivalent calendars - calendars = set( - [getattr(x, "calendar", "gregorian") for x in data.flat] + if source is not None: + if loadd is not None: + raise ValueError( + "Can't set the 'source' and 'loadd' parameters " + "at the same time" ) - if len(calendars) > 1: - raise ValueError( - "Not all date-time objects have equivalent " - "calendars: {}".format(tuple(calendars)) - ) - - # If the date-times are calendar-agnostic, assign the - # given calendar, defaulting to Gregorian. - if calendars.pop() == "": - calendar = getattr(self.Units, "calendar", "gregorian") - - new_data = numpy.empty(numpy_shape(data), dtype="O") - for i in numpy_ndindex(new_data.shape): - new_data[i] = cf_dt(data[i], calendar=calendar) - - data = new_data - - # Convert the date-time objects to reference times - data = dt2rt(data, None, units) - - _dtype = data.dtype - if not units.isreftime: + if loads is not None: raise ValueError( - "Can't initialise a reference time array with " - "units {!r}".format(units) + "Can't set the 'source' and 'loads' parameters " + "at the same time" ) - # --- End: if - shape = data.shape - ndim = data.ndim - size = data.size - axes = _initialise_axes(ndim) + if source is not None: + try: + array = source._get_Array(None) + except AttributeError: + array = None - # The _axes attribute is the ordered list of the data array's - # axis names. Each axis name is an arbitrary, unique - # string. DO NOT CHANGE IN PLACE. 
- self._axes = axes + super().__init__( + source=source, _use_array=_use_array and array is not None + ) - self._ndim = ndim - self._shape = shape - self._size = size + if _use_array: + try: + array = source.get_dask(copy=False) + except (AttributeError, TypeError): + pass + else: + self._set_dask( + array, + copy=copy, + delete_source=False, + reset_mask_hardness=False, + ) + else: + self._del_dask(None) - if dtype is not None: - _dtype = numpy_dtype(dtype) + # Note the mask hardness. It is safe to assume that if a + # dask array has been set, then it's mask hardness will be + # already baked into each chunk. + self._hardmask = getattr(source, "hardmask", _DEFAULT_HARDMASK) - self._dtype = _dtype + return - self._set_partition_matrix( - data, chunk=chunk, check_free_memory=check_free_memory - ) + super().__init__(array=array, fill_value=fill_value, _use_array=False) - # if isinstance(data, CompressedArray): - # self._set_CompressedArray(data, - # axes=axes) - # #if mask is not None: - # # self.where(mask, cf_masked, inplace=True) - ## - # # r#eturn - # else: - # matrix = _xxx.copy() - # - # matrix[()] = Partition(location = [(0, n) for n in shape], - # shape = list(shape), - # axes = axes, - # flip = empty_list, - # Units = units, - # subarray = data, - # part = empty_list) - # - # self.partitions = PartitionMatrix(matrix, empty_list) + # Create the _HDF_chunks attribute: defines HDF chunking when + # writing to disk. # - # if check_free_memory and free_memory() < cf_fm_threshold(): - # self.to_disk() - # - # if chunk: - # self.chunk() - # # --- End: if - - if mask is not None: - self.where(mask, cf_masked, inplace=True) + # Never change the value of the _HDF_chunks attribute + # in-place. + self._HDF_chunks = None - def _set_partition_matrix(self, array, chunk=True, check_free_memory=True): - """Set the array. 
+ if loadd is not None: + self.loadd(loadd) + return - :Parameters: + if loads is not None: + self.loads(loads) + return - array: subclass of `Array` - The array to be inserted. + # Set the units + units = Units(units, calendar=calendar) + self._Units = units - check_free_memory: `bool`, optional - If True then store the data array on disk if there is - is sufficient memory there. + # Note the mask hardness. This only records what we want the + # mask hardness to be, and is required in case this + # initialization does not set an array (i.e. array is None or + # _use_array is False). If a dask array is actually set later + # on, then the mask hardness will be set properly, i.e. it + # will be baked into each chunk. + self._hardmask = hardmask - :Returns: + if array is None: + return - `None` + try: + ndim = array.ndim + except AttributeError: + ndim = np.ndim(array) - **Examples:** + # Create the _cyclic attribute: identifies which axes are + # cyclic (and therefore allow cyclic slicing). It must be a + # subset of the axes given by the _axes attribute. If an axis + # is removed from _axes then it must also be removed from + # _cyclic. + # + # Never change the value of the _cyclic attribute in-place. + self._cyclic = _empty_set - >>> d._set_partition_matrix(array) + # Create the _axes attribute: an ordered sequence of unique + # (within this `Data` instance) names for each array axis. 
+ self._axes = generate_axis_identifiers(ndim) - """ - # if isinstance(array, CompressedArray): - get_compression_type = getattr(array, "get_compression_type", None) - if get_compression_type is not None and get_compression_type(): - # array is compressed - self._set_CompressedArray( - array, check_free_memory=check_free_memory - ) + if not _use_array: return - empty_list = [] - shape = array.shape - - matrix = _xxx.copy() - - matrix[()] = Partition( - location=[(0, n) for n in shape], - shape=list(shape), - axes=self._axes, - flip=empty_list, - Units=self.Units, - subarray=array, - part=empty_list, - ) - - self.partitions = PartitionMatrix(matrix, empty_list) + # Still here? Then create a dask array and store it. - if check_free_memory and free_memory() < cf_fm_threshold(): - self.to_disk() + # Attempt to read data from disk + if to_memory: + try: + array = array.to_memory() + except AttributeError: + pass - if chunk: - self.chunk() + # Find out if the data is compressed + try: + compressed = array.get_compression_type() + except AttributeError: + compressed = "" - source = self.source(None) - if source is not None and source.get_compression_type(): - self._del_Array(None) + if compressed: + if dask_from_array_options: + raise ValueError( + "Can't set 'dask_from_array_options' with " + "compressed input arrays" + ) - def _set_CompressedArray( - self, compressed_array, copy=None, check_free_memory=True - ): - """Create and insert a partition matrix for a compressed array. + # Save the input compressed array, as this will contain + # extra information, such as a count or index variable. 
+ self._set_Array(array) + + array = compressed_to_dask(array, chunks) + elif not is_dask_collection(array): + # Turn the array into a dask array + array = to_dask(array, chunks, dask_from_array_options) + + # Find out if we have an array of date-time objects + first_value = None + if not dt and array.dtype.kind == "O": + first_value = first_non_missing_value(array) + if first_value is not None: + dt = hasattr(first_value, "timetuple") + + # Convert string or object date-times to floating point + # reference times, if appropriate. + if array.dtype.kind in "USO" and (dt or units.isreftime): + array, units = convert_to_reftime(array, units, first_value) + # Reset the units + self._Units = units - .. versionadded:: 3.0.6 + # Store the dask array + self._set_dask(array, delete_source=False, reset_mask_hardness=False) - .. seealso:: `_set_Array`, `_set_partition_matrix`, `compress` + # Set the mask hardness on each chunk. + self.hardmask = hardmask - :Parameters: + # Override the data type + if dtype is not None: + self.dtype = dtype - compressed_array: subclass of `CompressedArray` + # Apply a mask + if mask is not None: + self.where(mask, cf_masked, inplace=True) - copy: optional - Ignored. + # @property# + # def dask_array(s#elf): + # """TODODASK.## + # + # :Returns: + # + # `dask.array.Array`## + # + # """ + # return self.get_dask(copy=True) - check_free_memory: `bool`, optional - If True then store the data array on disk if there is - is sufficient memory there. + @property + def dask_compressed_array(self): + """TODODASK. 
:Returns: - `None` + `dask.array.Array` """ - if check_free_memory and free_memory() < cf_fm_threshold(): - compressed_array.to_disk() - - new = type(self).empty( - shape=compressed_array.shape, units=self.Units, chunk=False - ) - new._axes = self._axes - - source_data = compressed_array.source() - compression_type = compressed_array.get_compression_type() - - if compression_type == "ragged contiguous": - # -------------------------------------------------------- - # Ragged contiguous - # -------------------------------------------------------- - new.chunk(total=[0], omit_axes=[1]) - - count = compressed_array.get_count().array - - start = 0 - for n, partition in zip(count, new.partitions.flat): - end = start + n - - partition.subarray = RaggedContiguousSubarray( - array=source_data, - shape=partition.shape, - compression={ - "instance_axis": 0, - "instance_index": 0, - "c_element_axis": 1, - "c_element_indices": slice(start, end), - }, - ) - partition.part = [] - - start += n - - elif compression_type == "ragged indexed": - # -------------------------------------------------------- - # Ragged indexed - # -------------------------------------------------------- - new.chunk(total=[0], omit_axes=[1]) - - index = compressed_array.get_index().array - - (instances, inverse) = numpy.unique(index, return_inverse=True) - - for i, partition in zip( - numpy.unique(inverse), new.partitions.flat - ): - partition.subarray = RaggedIndexedSubarray( - array=source_data, - shape=partition.shape, - compression={ - "instance_axis": 0, - "instance_index": 0, - "i_element_axis": 1, - "i_element_indices": numpy_where(inverse == i)[0], - }, - ) - partition.part = [] - - elif compression_type == "ragged indexed contiguous": - # -------------------------------------------------------- - # Ragged indexed contiguous - # -------------------------------------------------------- - new.chunk(total=[0, 1], omit_axes=[2]) - - index = compressed_array.get_index().array - count = 
compressed_array.get_count().array - - (instances, inverse) = numpy.unique(index, return_inverse=True) - - new_partitions = new.partitions.matrix - - shape = compressed_array.shape - - for i in range(shape[0]): - # For all of the profiles in ths instance, find the - # locations in the count array of the number of - # elements in the profile - xprofile_indices = numpy.where(index == i)[0] - - # Find the number of profiles in this instance - n_profiles = xprofile_indices.size - - # Loop over profiles in this instance - for j in range(shape[1]): - partition = new_partitions[i, j] - - if j >= n_profiles: - # This partition is full of missing data - subarray = FilledArray( - shape=partition.shape, - size=partition.size, - ndim=partition.ndim, - dtype=compressed_array.dtype, - fill_value=cf_masked, - ) - else: - # Find the location in the count array of the number - # of elements in this profile - profile_index = xprofile_indices[j] - - if profile_index == 0: - start = 0 - else: - start = int(count[:profile_index].sum()) - - stop = start + int(count[profile_index]) - - subarray = RaggedIndexedContiguousSubarray( - array=source_data, - shape=partition.shape, - compression={ - "instance_axis": 0, - "instance_index": 0, - "i_element_axis": 1, - "i_element_index": 0, - "c_element_axis": 2, - "c_element_indices": slice(start, stop), - }, - ) - # --- End: if - - partition.subarray = subarray - partition.part = [] - # --- End: for - # --- End: for - - elif compression_type == "gathered": - # -------------------------------------------------------- - # Gathered - # -------------------------------------------------------- - compressed_dimension = compressed_array.get_compressed_dimension() - compressed_axes = compressed_array.get_compressed_axes() - indices = compressed_array.get_list().array - - new.chunk(omit_axes=compressed_axes) # , total=[0]) - - for partition in new.partitions.flat: - partition_indices = partition.indices - compressed_part = [ - partition_indices[i] - for i 
in range(new.ndim) - if i not in compressed_axes - ] - compressed_part.insert(compressed_dimension, slice(None)) - - partition.subarray = GatheredSubarray( - array=source_data, - shape=partition.shape, - compression={ - "compressed_dimension": compressed_dimension, - "compressed_axes": compressed_axes, - "compressed_part": compressed_part, - "indices": indices, - }, - ) - - partition.part = [] - # --- End: if + ca = self.source(None) - self.partitions = new.partitions + if ca is None or not ca.get_compression_type(): + raise ValueError("not compressed: can't get compressed dask array") - self._set_Array(compressed_array, copy=False) + return ca.get_dask(copy=False).copy() + @daskified(daskified_log_level) def __contains__(self, value): """Membership test operator ``in`` @@ -1038,14 +677,19 @@ def __contains__(self, value): Returns True if the value is contained anywhere in the data array. The value may be a `cf.Data` object. + **Performance** + + All delayed operations are executed, and there is no + short-circuit once the first occurrence is found.
+ **Examples:** - >>> d = Data([[0.0, 1, 2], [3, 4, 5]], 'm') + >>> d = cf.Data([[0.0, 1, 2], [3, 4, 5]], 'm') >>> 4 in d True - >>> Data(3) in d + >>> cf.Data(3) in d True - >>> Data([2.5], units='2 m') in d + >>> cf.Data([2.5], units='2 m') in d True >>> [[2]] in d True @@ -1055,7 +699,12 @@ def __contains__(self, value): False """ - if isinstance(value, self.__class__): + + def contains_chunk(a, value): + out = value in a + return np.array(out).reshape((1,) * a.ndim) + + if isinstance(value, self.__class__): # TODODASK: check other types too self_units = self.Units value_units = value.Units if value_units.equivalent(self_units): @@ -1065,20 +714,23 @@ def __contains__(self, value): elif value_units: return False - value = value.array + value = value.get_dask(copy=False) - config = self.partition_configuration(readonly=True) + dx = self.get_dask(copy=False) - for partition in self.partitions.matrix.flat: - partition.open(config) - array = partition.array - partition.close() + out_ind = tuple(range(dx.ndim)) + dx_ind = out_ind - if value in array: - return True - # --- End: for + dx = da.blockwise( + partial(contains_chunk, value=value), + out_ind, + dx, + dx_ind, + adjust_chunks={i: 1 for i in out_ind}, + dtype=bool, + ) - return False + return bool(dx.any()) @property def _atol(self): @@ -1104,266 +756,20 @@ def _is_abstract_Array_subclass(self, array): """ return isinstance(array, cfdm.Array) - def _auxiliary_mask_from_1d_indices(self, compressed_indices): - """Returns the auxiliary masks corresponding to given indices. - - :Parameters: + def __data__(self): + """Returns a new reference to self.""" + return self - compressed_indices: + def __hash__(self): + """The built-in function `hash` - :Returns: + Generating the hash temporarily realizes the entire array in + memory, which may not be possible for large arrays. - `list` of `Data` - The auxiliary masks in a list.
- - """ - auxiliary_mask = [] - - for i, (compressed_index, size) in enumerate( - zip(compressed_indices, self._shape) - ): - - if isinstance( - compressed_index, slice - ) and compressed_index.step in (-1, 1): - # Compressed index is a slice object with a step of - # +-1 => no auxiliary mask required for this axis - continue - - index = numpy_zeros(size, dtype=bool) - index[compressed_index] = True - - compressed_size = index.sum() - - ind = numpy_where(index) - - ind0 = ind[0] - start = ind0[0] - envelope_size = ind0[-1] - start + 1 - - if 0 < compressed_size < envelope_size: - jj = [None] * self._ndim - jj[i] = envelope_size - - if start: - ind0 -= start - - mask = self._auxiliary_mask_component(jj, ind, True) - auxiliary_mask.append(mask) - # --- End: for - - return auxiliary_mask - - def _auxiliary_mask_return(self): - """Return the auxiliary mask. - - :Returns: - - `Data` or `None` - The auxiliary mask, or `None` if there isn't one. - - **Examples:** - - >>> m = d._auxiliary_mask_return() - - """ - _auxiliary_mask = self._auxiliary_mask - if not _auxiliary_mask: - shape = getattr(self, "shape", None) - if shape is not None: - return type(self).full(shape, fill_value=False, dtype=bool) - else: - return None - # --- End: if - - mask = _auxiliary_mask[0] - for m in _auxiliary_mask[1:]: - mask = mask | m - - return mask - - def _auxiliary_mask_add_component(self, mask): - """Add a new auxiliary mask. 
- - :Parameters: - - mask: `cf.Data` or `None` - - :Returns: - - `None` - - **Examples:** - - >>> d._auxiliary_mask_add_component(m) - - """ - if mask is None: - return - - # Check that this mask component has the correct number of - # dimensions - if mask.ndim != self._ndim: - raise ValueError( - "Auxiliary mask must have same number of axes as the data " - "array ({}!={})".format(mask.ndim, self.ndim) - ) - - # Check that this mask component has an appropriate shape - mask_shape = mask.shape - for i, j in zip(mask_shape, self._shape): - if not (i == j or i == 1): - raise ValueError( - "Auxiliary mask shape {} is not broadcastable to data " - "array shape {}".format(mask.shape, self._shape) - ) - - # Merge this mask component with another, if possible. - append = True - if self._auxiliary_mask is not None: - for m0 in self._auxiliary_mask: - if m0.shape == mask_shape: - # Merging the new mask with an existing auxiliary - # mask component - m0 |= mask - append = False - break - # --- End: if - - if append: - mask = mask.copy() - - # Make sure that the same axes are cyclic for the data - # array and the auxiliary mask - indices = [self._axes.index(axis) for axis in self._cyclic] - mask._cyclic = set([mask._axes[i] for i in indices]) - - if self._auxiliary_mask is None: - self._auxiliary_mask = [mask] - else: - self._auxiliary_mask.append(mask) - - def _auxiliary_mask_subspace(self, indices): - """Subspace the new auxiliary mask. - - :Returns: - - `None` - - **Examples:** - - >>> d._auxiliary_mask_subspace((slice(0, 9, 2))) - - """ - if not self._auxiliary_mask: - # There isn't an auxiliary mask - return - - new = [] - for mask in self._auxiliary_mask: - mask_indices = [ - (slice(None) if n == 1 else index) - for n, index in zip(mask.shape, indices) - ] - new.append(mask[tuple(mask_indices)]) - - self._auxiliary_mask = new - - def _create_auxiliary_mask_component(self, mask_shape, ind, compress): - """Create a new auxiliary mask component of given shape. 
- - :Parameters: - - mask_shape: `tuple` - The shape of the mask component to be created. This will - contain `None` for axes not spanned by the *ind* - parameter. - - *Parameter example* - ``mask_shape=(3, 11, None)`` - - ind: `numpy.ndarray` - As returned by a single argument call of - ``numpy.array(numpy[.ma].where(....))``. - - compress: `bool` - If True then remove whole slices which only contain masked - points. - - :Returns: - - `Data` - - """ - # -------------------------------------------------------- - # Find the shape spanned by ind - # -------------------------------------------------------- - shape = [n for n in mask_shape if n] - - # Note that, for now, auxiliary_mask has to be numpy array - # (rather than a cf.Data object) because we're going to index - # it with fancy indexing which a cf.Data object might not - # support - namely a non-monotonic list of integers. - auxiliary_mask = numpy_ones(shape, dtype=bool) - - auxiliary_mask[tuple(ind)] = False - - if compress: - # For compressed indices, remove slices which only - # contain masked points. (Note that we only expect to - # be here if there were N-d item criteria.) - for iaxis, (index, n) in enumerate(zip(ind, shape)): - index = set(index) - if len(index) < n: - auxiliary_mask = auxiliary_mask.take( - sorted(index), axis=iaxis - ) - # --- End: if - - # Add missing size 1 axes to the auxiliary mask - if auxiliary_mask.ndim < self.ndim: - i = [(slice(None) if n else numpy_newaxis) for n in mask_shape] - auxiliary_mask = auxiliary_mask[tuple(i)] - - return type(self)(auxiliary_mask) - - def _auxiliary_mask_tidy(self): - """Remove unnecessary auxiliary mask components. 
- - :Returns: - - `None` - - **Examples:** - - >>> d._auxiliary_mask_tidy() - - """ - auxiliary_mask = self._auxiliary_mask - if auxiliary_mask: - # Get rid of auxiliary mask components which are all False - auxiliary_mask = [m for m in auxiliary_mask if m.any()] - if not auxiliary_mask: - auxiliary_mask = None - else: - auxiliary_mask = None - - self._auxiliary_mask = auxiliary_mask - - def __data__(self): - """Returns a new reference to self.""" - return self - - def __hash__(self): - """The built-in function `hash` - - Generating the hash temporarily realizes the entire array in - memory, which may not be possible for large arrays. - - The hash value is dependent on the data-type and shape of the data - array. If the array is a masked array then the hash value is - independent of the fill value and of data array values underlying - any masked elements. + The hash value is dependent on the data-type and shape of the data + array. If the array is a masked array then the hash value is + independent of the fill value and of data array values underlying + any masked elements. The hash value may be different if regenerated after the data array has been changed in place. @@ -1554,6 +960,7 @@ def __repr__(self): """ return super().__repr__().replace("<", " 1: + raise NotImplementedError( + "Currently limited to at most one dimension's assignment " + "index being a 1-d array of integers or booleans. " + f"Got: {indices}" + ) + # TODODASK: The inherited algorithm that does assignment + # for multiple list/1-d array indices + # (cfdm.Data._set_subspace) won't work when the + # 1-d array is a dask array because it may need + # to be computed at __setitem__ runtime, which + # is not desirable. Until this can be fixed, + # it's easiest to disallow this case, that was + # allowed pre-dask. 
+ + # Roll axes with cyclic slices + if roll: + # For example, if assigning to slice(-2, 3) has been + # requested on a cyclic axis (and we're not using numpy + # indexing), then we roll that axis by two points and + # assign to slice(0, 5) instead. The axis is then unrolled + # by two points after the assignment has been made. + axes = self._axes + if not self._cyclic.issuperset([axes[i] for i in roll]): + raise IndexError( + "Can't do a cyclic assignment to a non-cyclic axis" + ) - The step of the input slice must have a step of `. + roll_axes = tuple(roll.keys()) + shifts = tuple(roll.values()) + self.roll(shift=shifts, axis=roll_axes, inplace=True) - :Parameters: + # Make sure that the units of value are the same as self + value = conform_units(value, self.Units) - index: `slice` - A slice object with a step of 1. + # Do the assignment + dx = self.get_dask(copy=False) + dx[indices] = dask_compatible(value) - size: `int` + # Unroll any axes that were rolled to enable a cyclic + # assignment + if roll: + shifts = [-shift for shift in shifts] + self.roll(shift=shifts, axis=roll_axes, inplace=True) - :Returns: + # Reset the mask hardness, otherwise it could be incorrect in + # the case that a chunk that was not a masked array is + # assigned missing values. + self._reset_mask_hardness() - `slice` + # Remove a source array, on the grounds that we can't + # guarantee its consistency with the updated dask array.
+ self._del_Array(None) - **Examples:** + return - >>> s = slice(2, 6) - >>> t = _mirror_slice(s, 8) - >>> s, t - slice(2, 6), slice(5, 1, -1) - >>> range(8)[s] - [2, 3, 4, 5] - >>> range(8)[t] - [5, 4, 3, 2] - >>> range(7, -1, -1)[s] - [5, 4, 3, 2] - >>> range(7, -1, -1)[t] - [2, 3, 4, 5] + # ---------------------------------------------------------------- + # Indexing behaviour attributes + # ---------------------------------------------------------------- + @property + @daskified(daskified_log_level) + def __orthogonal_indexing__(self): + """Flag to indicate that orthogonal indexing is supported. - """ - start, stop, step = index.indices(size) - size -= 1 - start = size - start - stop = size - stop - if stop < 0: - stop = None + Always True, indicating that 'orthogonal indexing' is + applied. This means that when indices are 1-d arrays or lists + then they subspace along each dimension independently. This + behaviour is similar to Fortran, but different to `numpy`. - return slice(start, stop, -1) + .. versionadded:: TODODASK - # --- End: def + .. seealso:: `__keepdims_indexing__`, `__getitem__`, + `__setitem__`, + `netCDF4.Variable.__orthogonal_indexing__` - config = self.partition_configuration(readonly=False) + **Examples** - # ------------------------------------------------------------ - # parse the indices - # ------------------------------------------------------------ - indices_in = indices - indices, roll, flip_axes, mask = parse_indices( - self._shape, indices_in, cyclic=True, reverse=True, mask=True - ) + >>> d = cf.Data([[1, 2, 3], + ... 
[4, 5, 6]]) + >>> e = d[[0], [0, 2]] + >>> e.shape + (1, 2) + >>> print(e.array) + [[1 3]] + >>> e = d[[0, 1], [0, 2]] + >>> e.shape + (2, 2) + >>> print(e.array) + [[1 3] + [4 6]] - if roll: - for iaxis, shift in roll.items(): - self.roll(iaxis, shift, inplace=True) - # --- End: if + """ + return True - if mask: - original_self = self.copy() + @property + @daskified(daskified_log_level) + def __keepdims_indexing__(self): + """Flag to indicate whether dimensions indexed with integers are + kept. - scalar_value = False - if value is cf_masked: - scalar_value = True - else: - if not isinstance(value, Data): - # Convert to the value to a Data object - value = type(self)(value, self.Units) - else: - if value.Units.equivalent(self.Units): - if not value.Units.equals(self.Units): - value = value.copy() - value.Units = self.Units - elif not value.Units: - value = value.override_units(self.Units) - else: - raise ValueError( - "Can't assign values with units {!r} to data with " - "units {!r}".format(value.Units, self.Units) - ) - # --- End: if + If set to True (the default) then providing a single integer + as a single-axis index does *not* reduce the number of array + dimensions by 1. This behaviour is different to `numpy`. - if value._size == 1: - scalar_value = True - value = value.datum(0) - # --- End: if + If set to False then providing a single integer as a + single-axis index reduces the number of array dimensions by + 1. This behaviour is the same as `numpy`. - source = self.source(None) - if source is not None and source.get_compression_type(): - self._del_Array(None) + .. versionadded:: TODODASK - if scalar_value: - # -------------------------------------------------------- - # The value is logically scalar - # -------------------------------------------------------- - for partition in self.partitions.matrix.flat: - p_indices, shape = partition.overlaps(indices) - if p_indices is None: - # This partition does not overlap the indices - continue + .. 
seealso:: `__orthogonal_indexing__`, `__getitem__`, + `__setitem__` - partition.open(config) - array = partition.array + **Examples** - if value is cf_masked and not partition.masked: - # The assignment is masking elements, so turn a - # non-masked sub-array into a masked one. - array = array.view(numpy_ma_MaskedArray) - partition.subarray = array + >>> d = cf.Data([[1, 2, 3], + ... [4, 5, 6]]) + >>> d.__keepdims_indexing__ + True + >>> e = d[0] + >>> e.shape + (1, 3) + >>> print(e.array) + [[1 2 3]] - self._set_subspace(array, p_indices, value) - partition.close() + >>> d.__keepdims_indexing__ + True + >>> e = d[:, 1] + >>> e.shape + (2, 1) + >>> print(e.array) + [[2] + [5]] - if roll: - for iaxis, shift in roll.items(): - self.roll(iaxis, -shift, inplace=True) - # --- End: if + >>> d.__keepdims_indexing__ + True + >>> e = d[0, 1] + >>> e.shape + (1, 1) + >>> print(e.array) + [[2]] - if mask: - indices = tuple(indices) - original_self = original_self[indices] - u = self[indices] - for m in mask: - u.where(m, original_self, inplace=True) + >>> d.__keepdims_indexing__ = False + >>> e = d[0] + >>> e.shape + (3,) + >>> print(e.array) + [1 2 3] - self[indices] = u - # --- End: if + >>> d.__keepdims_indexing__ + False + >>> e = d[:, 1] + >>> e.shape + (2,) + >>> print(e.array) + [2 5] - return + >>> d.__keepdims_indexing__ + False + >>> e = d[0, 1] + >>> e.shape + () + >>> print(e.array) + 2 - # ------------------------------------------------------------ - # Still here? Then the value is not logically scalar. 
- # ------------------------------------------------------------ - data0_shape = self._shape - value_shape = value._shape - - shape00 = list(map(_size_of_index, indices, data0_shape)) - shape0 = shape00[:] - - self_ndim = self._ndim - value_ndim = value._ndim - align_offset = self_ndim - value_ndim - if align_offset >= 0: - # self has more dimensions than other - shape0 = shape0[align_offset:] - shape1 = value_shape - ellipsis = None - - flip_axes = [ - i - align_offset for i in flip_axes if i >= align_offset - ] - else: - # value has more dimensions than self - v_align_offset = -align_offset - if value_shape[:v_align_offset] != [1] * v_align_offset: - # Can only allow value to have more dimensions then - # self if the extra dimensions all have size 1. - raise ValueError( - "Can't broadcast shape %r across shape %r" - % (value_shape, data0_shape) - ) + """ + return self._custom.get("__keepdims_indexing__", True) - shape1 = value_shape[v_align_offset:] - ellipsis = Ellipsis - align_offset = 0 + @__keepdims_indexing__.setter + def __keepdims_indexing__(self, value): + self._custom["__keepdims_indexing__"] = bool(value) - # Find out which of the dimensions of value are to be - # broadcast, and those which are not. Note that, as in numpy, - # it is not allowed for a dimension in value to be larger than - # a size 1 dimension in self - base_value_indices = [] - non_broadcast_dimensions = [] - - for i, (a, b) in enumerate(zip(shape0, shape1)): - if b == 1: - base_value_indices.append(slice(None)) - elif a == b and b != 1: - base_value_indices.append(None) - non_broadcast_dimensions.append(i) - else: - raise ValueError( - "Can't broadcast data with shape {!r} across " - "shape {!r}".format(shape1, tuple(shape00)) - ) - # --- End: for + def get_dask(self, copy=True): + """Get the underlying dask array. - previous_location = ((-1,),) * self_ndim - start = [0] * value_ndim + .. 
versionadded:: TODODASK - # save = pda_args['save'] - # keep_in_memory = pda_args['keep_in_memory'] + :Parameters: - value.to_memory() + copy: `bool`, optional + If False then return the actual dask array. By default + a copy is returned. - for partition in self.partitions.matrix.flat: - p_indices, shape = partition.overlaps(indices) + :Returns: - if p_indices is None: - # This partition does not overlap the indices - continue + `dask.array.Array` + The dask array. - # -------------------------------------------------------- - # Find which elements of value apply to this partition - # -------------------------------------------------------- - value_indices = base_value_indices[:] + """ + da = self._custom["dask"] + if copy: + da = da.copy() - for i in non_broadcast_dimensions: - j = i + align_offset - location = partition.location[j][0] - reference_location = previous_location[j][0] + return da - if location > reference_location: - stop = start[i] + shape[j] - value_indices[i] = slice(start[i], stop) - start[i] = stop + def _set_dask( + self, array, copy=False, delete_source=True, reset_mask_hardness=True + ): + """Set the dask array. - elif location == reference_location: - value_indices[i] = previous_slice[i] # noqa F821 + .. versionadded:: TODODASK - elif location < reference_location: - stop = shape[j] - value_indices[i] = slice(0, stop) - start[i] = stop - # --- End: for + .. seealso:: `_del_dask`, `get_dask`, `_reset_mask_hardness` - previous_location = partition.location - previous_slice = value_indices[:] # noqa F821 + :Parameters: - for i in flip_axes: - value_indices[i] = _mirror_slice(value_indices[i], shape1[i]) + array: `dask.array.Array` + The array to be inserted. - if ellipsis: - value_indices.insert(0, ellipsis) + copy: `bool`, optional + If True then copy *array* before setting it. By + default it is not copied. 
- # -------------------------------------------------------- - # - # -------------------------------------------------------- - v = value[tuple(value_indices)].varray + delete_source: `bool`, optional + If False then do not delete a source array, if one + exists, after setting the new dask array. By default a + source array is deleted. - # if keep_in_memory: #not save: - # v = v.copy() + reset_mask_hardness: `bool`, optional + If False then do not reset the mask hardness after + setting the new dask array. By default the mask + hardness is re-applied, even if the mask hardness has + not changed. - # Update the partition's data - partition.open(config) - array = partition.array + :Returns: - if not partition.masked and numpy_ma_isMA(v): - # The sub-array is not masked and the assignment is - # masking elements, so turn the non-masked sub-array - # into a masked one. - array = array.view(numpy_ma_MaskedArray) - partition.subarray = array + `None` - self._set_subspace(array, p_indices, v) + """ + if array is NotImplemented: + logger.warning( + "NotImplemented has been set in the place of a dask array" + ) + # This could occur if any sort of exception is raised by a + # function that is run on chunks (such as + # `cf_where`). Such a function could get run at definition + # time in order to ascertain suitability (such as data + # type casting, broadcasting, etc.). Note that the + # exception may be difficult to diagnose, as dask will + # have silently trapped it and returned NotImplemented + # (for instance, see `dask.array.core.elemwise`). Print + # statements in a local copy of dask are possibly the way + # to go if the cause of the error is not obvious.
- partition.close() - # --- End: For + if copy: + array = array.copy() - if roll: - # Unroll - for iaxis, shift in roll.items(): - self.roll(iaxis, -shift, inplace=True) - # --- End: if + self._custom["dask"] = array - if mask: - indices = tuple(indices) - original_self = original_self[indices] - u = self[indices] - for m in mask: - u.where(m, original_self, inplace=True) + if delete_source: + # Remove a source array, on the grounds that we can't + # guarantee its consistency with the new dask array. + self._del_Array(None) - self[indices] = u + if reset_mask_hardness: + self._reset_mask_hardness() - def _flag_partitions_for_processing(self, parallelise=True): - """TODO.""" - if mpi_on and parallelise: - # Add a flag `_process_partition` to each partition defining - # whether this partition will be processed on this process - n_partitions = self.partitions.size - x = n_partitions // mpi_size - if n_partitions < mpi_size: - for i, partition in enumerate(self.partitions.matrix.flat): - if i == mpi_rank: - partition._process_partition = True - else: - partition._process_partition = False - # --- End: if - # --- End: for - self._max_partitions_per_process = 1 - elif n_partitions % mpi_size == 0: - for i, partition in enumerate(self.partitions.matrix.flat): - if i // x == mpi_rank: - partition._process_partition = True - else: - partition._process_partition = False - # --- End: if - # --- End: for - self._max_partitions_per_process = x - else: - for i, partition in enumerate(self.partitions.matrix.flat): - if i // (x + 1) == mpi_rank: - partition._process_partition = True - else: - partition._process_partition = False - # --- End: if - # --- End: for - self._max_partitions_per_process = x + 1 - # --- End: if - else: - # Flag all partitions for processing on all processes - for partition in self.partitions.matrix.flat: - partition._process_partition = True - # --- End: if + def _del_dask(self, default=ValueError(), delete_source=True): + """Remove the dask array. 
- def _share_lock_files(self, parallelise): - """Share the lock files created by each rank for each - partition.""" - if parallelise: - # Only gather the lock files if the subarrays have been - # gathered between procesors, otherwise this will result - # in incorrect handling of the removal of temporary files - for partition in self.partitions.matrix.flat: - if partition.in_cached_file: - # The subarray is in a temporary file - lock_file = partition._register_temporary_file() - lock_files = mpi_comm.allgather(lock_file) - partition._update_lock_files(lock_files) - # --- End: if - # --- End: for - # --- End: if + .. versionadded:: TODODASK - @classmethod - def _share_partitions(cls, processed_partitions, parallelise): - """Share the partitions processed on each rank with every other - rank.""" - # Share the partitions processed on each rank with every other - # rank. If parallelise is False then there is nothing to be done - if parallelise: - # List to contain sublists of processed partitions from each - # rank - partition_list = [] - - for rank in range(mpi_size): - # Get the numper of processed partitions on each rank - if mpi_rank == rank: - n_partitions = len(processed_partitions) - else: - n_partitions = None - # --- End: if - n_partitions = mpi_comm.bcast(n_partitions, root=rank) + .. 
seealso:: `_set_dask`, `get_dask` - # Share each of the processed partitions on each rank with - # all the other ranks using broadcasting - if mpi_rank != rank: - shared_partitions = [] - # --- End: if + :Parameters: - for i in range(n_partitions): - if mpi_rank == rank: - partition = processed_partitions[i] - if isinstance( - partition._subarray, numpy_ndarray - ) and partition._subarray.dtype.kind in { - "b", - "i", - "u", - "f", - "c", - }: - # If the subarray is a supported numpy - # array, swap it out before broadcasting - # the partition - subarray = partition._subarray - partition._subarray = None - partition._subarray_removed = True - partition._subarray_dtype = subarray.dtype - partition._subarray_shape = subarray.shape - partition._subarray_isMA = numpy_ma_isMA(subarray) - if partition._subarray_isMA: - partition._subarray_is_masked = ( - subarray.mask is not numpy_ma_nomask - ) - else: - partition._subarray_is_masked = False - # --- End: if - else: - # The partition's subarray is either not a - # numpy array or is, for example, an array - # of strings, so it will be pickled and - # broadcast with the partition. - partition._subarray_removed = False - # --- End: if - else: - partition = None - # --- End: if + default: optional + Return the value of the *default* parameter if the + dask array axes has not been set. 
- # Pickle and broadcast the partition with or - # without the subarray - partition = mpi_comm.bcast(partition, root=rank) - - if partition._subarray_removed: - # If the subarray is a supported numpy array - # broadcast it without pickling and swap it - # back into the partition - if partition._subarray_isMA: - # If the subarray is a masked array broadcast - # the data and the mask separately - if mpi_rank != rank: - if partition._subarray_is_masked: - subarray = numpy_ma_masked_all( - partition._subarray_shape, - dtype=partition._subarray_dtype, - ) - else: - subarray = numpy_ma_empty( - partition._subarray_shape, - dtype=partition._subarray_dtype, - ) - # --- End: if - # --- End: if - mpi_comm.Bcast(subarray.data, root=rank) - if partition._subarray_is_masked: - mpi_comm.Bcast(subarray.mask, root=rank) - # --- End: if - else: - if mpi_rank != rank: - subarray = numpy_empty( - partition._subarray_shape, - dtype=partition._subarray_dtype, - ) - # --- End: if - mpi_comm.Bcast(subarray, root=rank) - # --- End: if - - # Swap the subarray back into the partition - partition._subarray = subarray - if mpi_rank == rank: - # The result of broadcasting an object is - # different to the original object even on the - # root PE, so the new partition with the numpy - # subarray must be put back in the list of - # processed partitions - processed_partitions[i] = partition - # --- End: if - - # Clean up temporary attributes - del partition._subarray_dtype - del partition._subarray_shape - del partition._subarray_isMA - del partition._subarray_is_masked - elif mpi_rank == rank: - # Remove the subarray from partition so that - # when it is deleted it does not delete the - # temporary file - partition._subarray = None - # --- End: if + {{default Exception}} - # Clean up temporary attribute - del partition._subarray_removed + delete_source: `bool`, optional + If False then do not delete a compressed source array, + if one exists. 
- if mpi_rank != rank: - shared_partitions.append(partition) - # --- End: if - # --- End: for + :Returns: - # Add the sublist of processed partitions from each rank - # to a list - if mpi_rank == rank: - partition_list.append(processed_partitions) - else: - partition_list.append(shared_partitions) - # --- End: if - # --- End: for + `dask.array.Array` + The removed dask array. - # Flatten the list of lists of processed partitions - processed_partitions = [ - item for sublist in partition_list for item in sublist - ] - # --- End: if - return processed_partitions + **Examples** + + >>> d = cf.Data([1, 2, 3]) + >>> dx = d._del_dask() + >>> d._del_dask("No dask array") + 'No dask array' + >>> d._del_dask() + Traceback (most recent call last): + ... + ValueError: 'Data' has no dask array + >>> d._del_dask(RuntimeError('No dask array')) + Traceback (most recent call last): + ... + RuntimeError: No dask array + + """ + try: + out = self._custom.pop("dask") + except KeyError: + return self._default( + default, f"{self.__class__.__name__!r} has no dask array" + ) + + if delete_source: + # Remove a source array, on the grounds that we can't + # guarantee its consistency with any future new dask + # array. + self._del_Array(None) + + return out + + def _map_blocks( + self, func, delete_source=True, reset_mask_hardness=True, **kwargs + ): + """Apply a function to the data in-place. + + .. versionadded:: TODODASK + + :Parameters: + + func: + The function to be applied to the data, via + `dask.array.map_blocks`, to each chunk of the dask + array. + + delete_source: `bool`, optional + If False then do not delete a source array, if one + exists, after applying the function. By default a + source array is deleted. + + reset_mask_hardness: `bool`, optional + If False then do not reset the mask hardness after + applying the function. By default the mask hardness is + re-applied, even if the mask hardness has not changed. 
+ + kwargs: optional + Keyword arguments passed to the + `dask.array.map_blocks` method. + + :Returns: + + `dask.array.Array` + The updated dask array. + + **Examples** + + >>> d = cf.Data([1, 2, 3]) + >>> dx = d._map_blocks(lambda x: x / 2) + >>> print(d.array) + [0.5 1. 1.5] + + """ + dx = self.get_dask(copy=False) + dx = dx.map_blocks(func, **kwargs) + self._set_dask( + dx, + delete_source=delete_source, + reset_mask_hardness=reset_mask_hardness, + ) + + return dx + + def _reset_mask_hardness(self): + """Re-apply the mask hardness to the dask array. + + .. versionadded:: TODODASK + + .. seealso:: `hardmask`, `harden_mask`, `soften_mask` + + :Returns: + + `None` + + """ + self.hardmask = self.hardmask @_inplace_enabled(default=False) def diff(self, axis=-1, n=1, inplace=False): @@ -2313,7 +1659,7 @@ def diff(self, axis=-1, n=1, inplace=False): # Diff each section for key, data in sections.items(): - output_array = numpy_diff(data.array, axis=axis) + output_array = np.diff(data.array, axis=axis) sections[key] = type(self)( output_array, units=self.Units, fill_value=self.fill_value @@ -2351,7 +1697,7 @@ def dumps(self): p["units"] = str(p.pop("Units")) # --- End: for - return json_dumps(d, default=_convert_to_builtin_type) + return json_dumps(d, default=convert_to_builtin_type) def digitize( self, @@ -2527,7 +1873,7 @@ def digitize( else: bin_units = org_units - bins = numpy_asanyarray(bins) + bins = np.asanyarray(bins) if bins.ndim > 2: raise ValueError( @@ -2562,7 +1908,7 @@ def digitize( # --- End: for two_d_bins = bins - bins = numpy_unique(bins) + bins = np.unique(bins) # Find the bins that were omitted from the original 2-d # bins array. 
Note that this includes the left-open and @@ -2599,7 +1945,7 @@ def digitize( mx = self.max().datum() mn = self.min().datum() - bins = numpy_linspace(mn, mx, int(bins) + 1, dtype=float) + bins = np.linspace(mn, mx, int(bins) + 1, dtype=float) delete_bins = [] @@ -2612,7 +1958,7 @@ def digitize( bins = bins.astype(float, copy=True) - epsilon = numpy_finfo(float).eps + epsilon = np.finfo(float).eps ndim = bins.ndim if upper: mn = bins[(0,) * ndim] @@ -2627,7 +1973,7 @@ def digitize( delete_bins.append(bins.size) if return_bins and two_d_bins is None: - x = numpy_empty((bins.size - 1, 2), dtype=bins.dtype) + x = np.empty((bins.size - 1, 2), dtype=bins.dtype) x[:, 0] = bins[:-1] x[:, 1] = bins[1:] two_d_bins = x @@ -2639,20 +1985,20 @@ def digitize( array = partition.array mask = None - if numpy_ma_isMA(array): + if np.ma.isMA(array): mask = array.mask.copy() - array = numpy_digitize(array, bins, right=upper) + array = np.digitize(array, bins, right=upper) if delete_bins: for n, d in enumerate(delete_bins): d -= n - array = numpy_ma_where(array == d, numpy_ma_masked, array) - array = numpy_ma_where(array > d, array - 1, array) + array = np.ma.where(array == d, np.ma.masked, array) + array = np.ma.where(array > d, array - 1, array) # --- End: if if mask is not None: - array = numpy_ma_where(mask, numpy_ma_masked, array) + array = np.ma.where(mask, np.ma.masked, array) partition.subarray = array partition.Units = _units_None @@ -2893,7 +2239,7 @@ def percentile( [2 2 3 3]] """ - ranks = numpy_array(ranks).flatten() + ranks = np.array(ranks).flatten() ranks.sort() if ranks[0] < 0 or ranks[-1] > 100: @@ -2914,8 +2260,7 @@ def percentile( # If the input data array 'fits' in one chunk of memory, then # make sure that it has only one partition if ( - not mpi_on - and not _preserve_partitions + not _preserve_partitions and self._pmndim and self.fits_in_one_chunk_in_memory(self.dtype.itemsize) ): @@ -2928,14 +2273,14 @@ def percentile( for key, data in sections.items(): array = 
data.array - masked = numpy_ma_is_masked(array) + masked = np.ma.is_masked(array) if masked: if array.dtype != _dtype_float: # Can't assign NaNs to integer arrays array = array.astype(float, copy=True) - array = numpy_ma_filled(array, numpy_nan) - func = numpy_nanpercentile + array = np.ma.filled(array, np.nan) + func = np.nanpercentile with numpy_testing_suppress_warnings() as sup: sup.filter( @@ -2951,9 +2296,9 @@ def percentile( ) # Replace NaNs with missing data - p = numpy_ma_masked_where(numpy_isnan(p), p, copy=False) + p = np.ma.masked_where(np.isnan(p), p, copy=False) else: - func = numpy_percentile + func = np.percentile p = func( array, ranks, @@ -2988,6 +2333,26 @@ def percentile( return out + @_inplace_enabled(default=False) + def persist(self, inplace=False): + """TODODASK. + + should this be called `to_memory`? This is part of the larger + scheme for memory management + + **Performance** + + `persist` causes all delayed operations to be computed. + + """ + d = _inplace_enabled_define_and_cleanup(self) + + dx = self.get_dask(copy=False) + dx = dx.persist() + d._set_dask(dx, reset_mask_hardness=False) + + return d + def loads(self, j, chunk=True): """Reset the data in place from a string serialization. 
@@ -3016,7 +2381,7 @@ def loads(self, j, chunk=True): # Convert dtype to numpy.dtype if "dtype" in d: - d["dtype"] = numpy_dtype(d["dtype"]) + d["dtype"] = np.dtype(d["dtype"]) # Convert units to Units if "units" in d: @@ -3171,19 +2536,8 @@ def dumpd(self): subarray["file"] = p_subarray.get_filename() subarray["shape"] = p_subarray.shape - # for attr in ('file', 'shape'): - # subarray[attr] = getattr(p_subarray, attr) - subarray["ncvar"] = p_subarray.get_ncvar() subarray["varid"] = p_subarray.get_varid() - # for attr in ('ncvar', 'varid'): - # value = getattr(p_subarray, attr, None) - # # value = getattr(p_subarray.array, attr, None) - # # p_subarray.array.inspect() - # - # if value is not None: - # subarray[attr] = value - # --- End: for if p_dtype != dtype: subarray["dtype"] = p_dtype @@ -3197,6 +2551,11 @@ def dumpd(self): # ---------------------------------------------------- attrs["format"] = "UM" + # TODOCFA: CFA only allows for one address. Surely(?) + # we only need the "header_offset", from + # which the "data_offset" and "disk_length" + # can be derived at read time. 
+ subarray = {} for attr in ( "filename", @@ -3213,21 +2572,12 @@ def dumpd(self): attrs["subarray"] = subarray else: attrs["subarray"] = p_subarray - # attrs['subarray'] = p_subarray.array partitions.append(attrs) # --- End: for cfa_data["Partitions"] = partitions - # ------------------------------------------------------------ - # Auxiliary mask - # ------------------------------------------------------------ - if self._auxiliary_mask: - cfa_data["_auxiliary_mask"] = [ - m.copy() for m in self._auxiliary_mask - ] - return cfa_data def loadd(self, d, chunk=True): @@ -3284,36 +2634,33 @@ def loadd(self, d, chunk=True): self._shape = shape self._ndim = len(shape) - self._size = functools_reduce(operator_mul, shape, 1) + self._size = reduce(mul, shape, 1) cyclic = d.get("_cyclic", None) + # Never change the value of the _cyclic attribute in-place if cyclic: self._cyclic = cyclic.copy() else: self._cyclic = _empty_set HDF_chunks = d.get("_HDF_chunks", None) + # Never change the value of the _HDF_chunks attribute in-place if HDF_chunks: self._HDF_chunks = HDF_chunks.copy() else: self._HDF_chunks = None filename = d.get("file", None) - # if filename is not None: - - # filename = abspath(filename) base = d.get("base", None) - # if base is not None: - # base = abspath(base) # ------------------------------------------------------------ # Initialise an empty partition array # ------------------------------------------------------------ - partition_matrix = PartitionMatrix( - numpy_empty(d.get("_pmshape", ()), dtype=object), - list(d.get("_pmaxes", ())), - ) + partition_matrix = None # PartitionMatrix( + # np.empty(d.get("_pmshape", ()), dtype=object), + # list(d.get("_pmaxes", ())), + # ) pmndim = partition_matrix.ndim # ------------------------------------------------------------ @@ -3345,13 +2692,13 @@ def loadd(self, d, chunk=True): else: p_units = Units(p_units) - partition = Partition( - location=location, - axes=attrs.get("axes", axes)[:], - flip=attrs.get("flip", 
def can_compute(self, functions=None, log_levels=None, override=False):
    """Whether or not it is acceptable to compute the data.

    TODODASK - this method is premature - needs thinking about as
    part of the wider resource management issue.

    If the data is explicitly requested to be computed (as would
    be the case when writing to disk, or accessing the `array`
    attribute) then computation will always occur.

    This method is meant for cases when computation is desirable
    but not essential, by providing an assessment of whether
    computation would require excessive resources (time, memory,
    and CPU), if carried out.

    By default it is considered acceptable to compute the data if
    the computed array fits in available memory and any of the
    following are true, assessed in the order given up to the
    first criterion satisfied:

    1. The `force_compute` attribute is True.

    2. The current log level is ``'DEBUG'``.

    3. Any computations stored after initialisation consist only
       of subspace, concatenate, reshape, and copy functions.

    .. note:: Whilst the migration to dask is in progress this
              method unconditionally returns `True`; the
              assessment described above is not yet applied.

    .. versionadded:: 4.0.0

    .. seealso:: `force_compute`, `cf.log_level`

    :Parameters:

        functions: (sequence of) `str`, optional
            Include the specified functions, in addition to the
            defaults, as those that will allow
            computation. Functions are identified by matching the
            beginnings of the key names in the dask graph layers,
            found with the `dask.layers` attribute of the dask
            array. See the *override* parameter.

        log_levels: (sequence of) `str`, optional
            Include the specified log levels, in addition to the
            default, as those that will allow computation. See
            the *override* parameter.

        override: `bool`, optional
            If True then only compute the data for the given
            *log_levels* (if any) and the given *functions* (if
            any), ignoring the defaults. If the `force_compute`
            attribute is True then computation occurs in any case.

    :Returns:

        `bool`
            True if acceptable to compute the data, otherwise
            False.

    """
    # TODODASK: Always return True for now, to aid development.
    return True

    # ----------------------------------------------------------------
    # The intended assessment. It is unreachable until the early
    # return above is removed, and is retained as the working design.
    # ----------------------------------------------------------------
    dx = self.get_dask(copy=False)

    # TODODASK fits in memory.

    # 1 Force compute
    if self.force_compute:
        return True

    # 2 Log levels
    if override:
        # Only the caller's log levels and functions are considered
        allowed_log_levels = ()
        allowed_functions = ()
    else:
        allowed_log_levels = ("DEBUG",)
        allowed_functions = (
            "array-",
            "getitem-",
            "copy-",
            "concatenate-",
            "reshape-",
        )

    if log_levels:
        if isinstance(log_levels, str):
            log_levels = (log_levels,)

        allowed_log_levels += tuple(log_levels)

    if log_level().value in allowed_log_levels:
        return True

    # 3 Stored computations
    layers = dx.dask.layers
    if len(layers) == 1:
        # No stored computations after initialisation
        return True

    if functions:
        if isinstance(functions, str):
            functions = (functions,)

        # Extend with the caller's functions (previously the tuple
        # was extended with itself, so *functions* was ignored)
        allowed_functions += tuple(functions)

    # Every layer after the initialisation layer must have a key that
    # starts with one of the allowed prefixes. Note that
    # str.startswith returns False for an empty tuple of prefixes.
    return all(
        key.startswith(allowed_functions) for key in tuple(layers)[1:]
    )
] - [4.25 4.1 4.95 5.4 ] - [7.65 7.5 8.35 8.8 ]] - >>> d.cyclic(1) - set() - >>> d.cyclic() - {1} - >>> e = d.convolution_filter([0.1, 0.5, 0.25], axis=1) - >>> print(e.array) - [[0.85 0.7 1.55 2. ] - [4.25 4.1 4.95 5.4 ] - [7.65 7.5 8.35 8.8 ]] + in-place. """ + # TODODSAK - map_overlap + try: scipy_convolve1d except NameError: @@ -3637,7 +3053,7 @@ def convolution_filter( # Set cval to NaN if it is currently None, so that the edges # will be filled with missing data if the mode is 'constant' if cval is None: - cval = numpy_nan + cval = np.nan # Section the data into sections up to a chunk in size sections = self.section([iaxis], chunks=True) @@ -3647,9 +3063,9 @@ def convolution_filter( for key, data in sections.items(): data.dtype = float input_array = data.array - masked = numpy_ma_is_masked(input_array) + masked = np.ma.is_masked(input_array) if masked: - input_array = input_array.filled(numpy_nan) + input_array = input_array.filled(np.nan) output_array = scipy_convolve1d( input_array, @@ -3659,9 +3075,9 @@ def convolution_filter( cval=cval, origin=origin, ) - if masked or (mode == "constant" and numpy_isnan(cval)): - with numpy_errstate(invalid="ignore"): - output_array = numpy_ma_masked_invalid(output_array) + if masked or (mode == "constant" and np.isnan(cval)): + with np.errstate(invalid="ignore"): + output_array = np.ma.masked_invalid(output_array) # --- End: if sections[key] = type(self)( @@ -3764,21 +3180,21 @@ def cumsum(self, axis, masked_as_zero=False, inplace=False): array = data.array filled = False - if masked_as_zero and numpy_ma_is_masked(array): + if masked_as_zero and np.ma.is_masked(array): mask = array.mask array = array.filled(0) filled = True - array = numpy_cumsum(array, axis=axis) + array = np.cumsum(array, axis=axis) if filled: size = array.shape[axis] shape = [1] * array.ndim shape[axis] = size - new_mask = numpy_cumsum(mask, axis=axis) == numpy_arange( + new_mask = np.cumsum(mask, axis=axis) == np.arange( 1, size + 1 ).reshape(shape) 
@_inplace_enabled(default=False)
def rechunk(
    self,
    chunks=_DEFAULT_CHUNKS,
    threshold=None,
    block_size_limit=None,
    balance=False,
    inplace=False,
):
    """Convert blocks in the dask array for new chunks.

    See `dask.array.rechunk` for more details.

    .. versionadded:: 4.0.0

    .. seealso:: `chunks`

    :Parameters:

        chunks: `int`, `tuple`, `dict` or `str`, optional
            The new block dimensions to create. ``-1`` indicates
            the full size of the corresponding dimension, and
            ``"auto"`` automatically determines chunk sizes. If
            omitted then the module-level ``_DEFAULT_CHUNKS``
            value is used.

        threshold: `int`, optional
            The graph growth factor under which we don't bother
            introducing an intermediate step.

        block_size_limit: `int`, optional
            The maximum block size (in bytes) we want to
            produce. Defaults to the configuration value
            ``dask.array.chunk-size``.

            TODODASK - how to use/import dask config items??

        balance: `bool`, optional
            If True, try to make each chunk the same size. By
            default this is not attempted.

            This means ``balance=True`` will remove any small
            leftover chunks, so using ``x.rechunk(chunks=len(x) //
            N, balance=True)`` will almost certainly result in
            ``N`` chunks.

        inplace: `bool`, optional
            If True then do the operation in-place (handled by the
            `_inplace_enabled` decorator).

    :Returns:

        `Data` or `None`
            The rechunked data, or `None` if the operation was
            in-place.

    **Examples:**

    >>> x = cf.Data.ones((1000, 1000), chunks=(100, 100))

    Specify uniform chunk sizes with a tuple

    >>> y = x.rechunk((1000, 10))

    Or chunk only specific dimensions with a dictionary

    >>> y = x.rechunk({0: 1000})

    Use the value ``-1`` to specify that you want a single chunk
    along a dimension or the value ``"auto"`` to specify that dask
    can freely rechunk a dimension to attain blocks of a uniform
    block size

    >>> y = x.rechunk({0: -1, 1: 'auto'}, block_size_limit=1e8)

    If a chunk size does not divide the dimension then rechunk
    will leave any unevenness to the last chunk.

    >>> x.rechunk(chunks=(400, -1)).chunks
    ((400, 400, 200), (1000,))

    However if you want more balanced chunks, and don't mind Dask
    choosing a different chunksize for you then you can use the
    ``balance=True`` option.

    >>> x.rechunk(chunks=(400, -1), balance=True).chunks
    ((500, 500), (1000,))

    """
    d = _inplace_enabled_define_and_cleanup(self)

    dx = d.get_dask(copy=False)

    # Pass the options by keyword so that the call does not depend
    # on the positional order of dask's rechunk parameters
    dx = dx.rechunk(
        chunks=chunks,
        threshold=threshold,
        block_size_limit=block_size_limit,
        balance=balance,
    )

    d._set_dask(dx, delete_source=False, reset_mask_hardness=False)

    return d
units0, method)(units0)# !!!!!!! units0*units0 YOWSER + # !!!!!!! units0*units0 YOWSER else: # Both units are defined (note: if the units are # noncombinable then this will raise an exception) @@ -4657,22 +3880,10 @@ def _binary_operation(self, other, method): other = type(self).asdata(other) - # if other._isdt and self.Units.isreftime: - # # Make sure that an array of date-time objects has the - # # right calendar. - # other.override_units(self.Units, i=True) - # if other._isdt and self.Units.isreftime: - # # Make sure that an array of date-time objects has the - # # right calendar. - # other.override_units(self.Units, i=True) - data0 = self.copy() data0, other, new_Units = data0._combined_units(other, method, True) - # calendar_arithmetic = (data0.Units.isreftime and - # other.Units.iscalendartime) - # ------------------------------------------------------------ # Bring other into memory, if appropriate. # ------------------------------------------------------------ @@ -4754,7 +3965,7 @@ def _binary_operation(self, other, method): new_axes = [] existing_axes = self._all_axis_names() for n in new_shape: - axis = data0._new_axis_identifier(existing_axes) + axis = new_axis_identifier(existing_axes) existing_axes.append(axis) new_axes.append(axis) # --- End: for @@ -4785,47 +3996,13 @@ def _binary_operation(self, other, method): broadcast_indices.append(slice(None)) - new_size = functools_reduce(operator_mul, new_shape, 1) + new_size = reduce(mul, new_shape, 1) dummy_location = [None] * new_ndim # ---End: if new_flip = [] - # if broadcasting: - # max_size = 0 - # for partition in data0.partitions.matrix.flat: - # indices0 = partition.indices - # indices1 = tuple([ - # (index if not broadcast_index else broadcast_index) - # for index, broadcast_index in zip( - # indices0[align_offset:], broadcast_indices) - # ]) - # indices1 = (Ellipsis,) + indices - # - # shape0 = partition.shape - # shape1 = [index.stop - index.start - # for index in parse_indices(other, indices1)] 
- # - # broadcast_size = 1 - # for n0, n1 in izip_longest( - # shape0[::-1], shape1[::-1], fillvalue=1): - # if n0 > 1: - # broadcast_size *= n0 - # else: - # broadcast_size *= n1 - # # --- End: for - # - # if broadcast_size > max_size: - # max_size = broadcast_size - # # --- End: for - # - # chunksize = cf_chunksize() - # ffff = max_size*(new_dtype.itemsize + 1) - # if ffff > chunksize: - # data0.chunk(chunksize*(chunksize/ffff)) - # # --- End: if - # ------------------------------------------------------------ # Create a Data object which just contains the metadata for # the result. If we're doing a binary arithmetic operation @@ -4834,41 +4011,24 @@ def _binary_operation(self, other, method): # with this new metadata. # ------------------------------------------------------------ - # if new_shape != data0_shape: - # set_location_map = True - # new_size = self._size - # dummy_location = [None] * new_ndim - # else: - # set_location_map = False - # new_size = functools_reduce(mul, new_shape, 1) - - # if not set_location_map: - # new_size = functools_reduce(mul, new_shape, 1) - # else: - # new_size = self._size - result = data0.copy() result._shape = new_shape result._ndim = new_ndim result._size = new_size result._axes = new_axes - # result._flip = new_flip() - - # Is the result an array of date-time objects? 
- # new_isdt = data0._isdt and new_Units.isreftime # ------------------------------------------------------------ # Set the data-type of the result # ------------------------------------------------------------ if method_type in ("_eq", "_ne", "_lt", "_le", "_gt", "_ge"): - new_dtype = numpy_dtype(bool) + new_dtype = np.dtype(bool) rtol = self._rtol atol = self._atol else: if "true" in method: - new_dtype = numpy_dtype(float) + new_dtype = np.dtype(float) elif not inplace: - new_dtype = numpy_result_type(data0.dtype, other.dtype) + new_dtype = np.result_type(data0.dtype, other.dtype) else: new_dtype = data0.dtype # --- End: if @@ -4877,35 +4037,9 @@ def _binary_operation(self, other, method): # Set flags to control whether or not the data of result and # self should be kept in memory # ------------------------------------------------------------ - # keep_result_in_memory = result.fits_in_memory(new_dtype.itemsize) - # keep_self_in_memory = data0.fits_in_memory(data0.dtype.itemsize) - # if not inplace: - # # When doing a binary arithmetic operation we need to - # # decide whether or not to keep self's data in memory - # revert_to_file = True - # save_self = not data0.fits_in_memory(data0.dtype.itemsize) - # keep_self_in_memory = data0.fits_in_memory(data0.dtype.itemsize) - # else: - # # When doing an augmented arithmetic assignment we don't - # # need to keep self's original data in memory - # revert_to_file = False - # save_self = False - # keep_self_in_memory = True - - # dimensions = self._axes - # direction = self.direction - # units = self.Units - config = data0.partition_configuration(readonly=not inplace) - # print('config[readonly] =', config['readonly']) - - # if calendar_arithmetic: - # pda_args['func'] = rt2dt - # pda_args['update'] = False - # pda_args['dtype'] = None - - original_numpy_seterr = numpy_seterr(**_seterr) + original_numpy_seterr = np.seterr(**_seterr) # Think about dtype, here. 
@@ -4944,10 +4078,6 @@ def _binary_operation(self, other, method): # -------------------------------------------------------- # Do the binary operation on this partition's data # -------------------------------------------------------- - # if calendar_arithmetic: - # pass - # else: - try: if method == "__eq__": # and data0.Units.isreftime: array0 = _numpy_isclose( @@ -4958,22 +4088,17 @@ def _binary_operation(self, other, method): array0, array1, rtol=rtol, atol=atol ) else: - # print(method) - # print(repr(array0)) - # print(repr(array1)) - # print() array0 = getattr(array0, method)(array1) - # try: - # array0 = getattr(array0, method)(array1) + except FloatingPointError as error: # Floating point point errors have been trapped if _mask_fpe[0]: # Redo the calculation ignoring the errors and # then set invalid numbers to missing data - numpy_seterr(**_seterr_raise_to_ignore) + np.seterr(**_seterr_raise_to_ignore) array0 = getattr(array0, method)(array1) - array0 = numpy_ma_masked_invalid(array0, copy=False) - numpy_seterr(**_seterr) + array0 = np.ma.masked_invalid(array0, copy=False) + np.seterr(**_seterr) else: # Raise the floating point error exception raise FloatingPointError(error) @@ -4982,7 +4107,7 @@ def _binary_operation(self, other, method): raise TypeError( "Incompatible result data-type ({0!r}) for " "in-place {1!r} arithmetic".format( - numpy_result_type(array0.dtype, array1.dtype).name, + np.result_type(array0.dtype, array1.dtype).name, array0.dtype.name, ) ) @@ -4991,14 +4116,14 @@ def _binary_operation(self, other, method): # --- End: try if array0 is NotImplemented: - array0 = numpy_zeros(partition.shape, dtype=bool) - elif not array0.ndim and not isinstance(array0, numpy_ndarray): - array0 = numpy_asanyarray(array0) + array0 = np.zeros(partition.shape, dtype=bool) + elif not array0.ndim and not isinstance(array0, np.ndarray): + array0 = np.asanyarray(array0) if not inplace: p_datatype = array0.dtype if new_dtype != p_datatype: - new_dtype = 
numpy_result_type(p_datatype, new_dtype) + new_dtype = np.result_type(p_datatype, new_dtype) partition.subarray = array0 partition.Units = new_Units @@ -5019,7 +4144,7 @@ def _binary_operation(self, other, method): # --- End: for # Reset numpy.seterr - numpy_seterr(**original_numpy_seterr) + np.seterr(**original_numpy_seterr) source = result.source(None) if source is not None and source.get_compression_type(): @@ -5065,64 +4190,14 @@ def __query_set__(self, values): return out - # new = self.copy() - # - # pda_args = new.pda_args(revert_to_file=True) - # - # for partition in new.partitions.matrix.flat: - # array = partition.dataarray(**pda_args) - # - # i = iter(values) - # value = next(i) - # out = (array == value) - # for value in i: - # out |= (array == value) - # - # partition.subarray = out - # partition.close() - # # --- End: for - # - # new.dtype = bool - # - # return new - def __query_wi__(self, value): """Implements the “within a range” condition.""" return (self >= value[0]) & (self <= value[1]) - # new = self.copy() - # - # pda_args = new.pda_args(revert_to_file=True) - # - # for partition in new.partitions.matrix.flat: - # array = partition.dataarray(**pda_args) - # print(array, new.Units, type(value0), value1) - # partition.subarray = (array >= value0) & (array <= value1) - # partition.close() - # # --- End: for - # - # new.dtype = bool - # - # return new - def __query_wo__(self, value): - """Implements the “without a range” condition.""" + """TODO.""" return (self < value[0]) | (self > value[1]) - # new = self.copy() - # - # pda_args = new.pda_args(revert_to_file=True) - # - # for partition in new.partitions.matrix.flat: - # array = partition.dataarray(**pda_args) - # partition.subarray = (array < value0) | (array > value1) - # partition.close() - # # --- End: for - # - # new.dtype = bool - # - # return new - @classmethod def concatenate(cls, data, axis=0, _preserve=True): """Join a sequence of data arrays together. 
@@ -5388,7 +4463,7 @@ def concatenate(cls, data, axis=0, _preserve=True): new_pmshape[0] += matrix1.shape[0] # Initialise an empty partition matrix with the new shape - new_matrix = numpy_empty(new_pmshape, dtype=object) + new_matrix = np.empty(new_pmshape, dtype=object) # Insert the data0 partition matrix new_matrix[: matrix0.shape[0]] = matrix0 @@ -5404,7 +4479,7 @@ def concatenate(cls, data, axis=0, _preserve=True): # ------------------------------------------------------------ # 7. Update the size, shape and dtype of data0 # ------------------------------------------------------------ - original_shape0 = data0._shape + # original_shape0 = data0._shape data0._size += data1._size @@ -5415,66 +4490,7 @@ def concatenate(cls, data, axis=0, _preserve=True): dtype0 = data0.dtype dtype1 = data1.dtype if dtype0 != dtype1: - data0.dtype = numpy_result_type(dtype0, dtype1) - - # -------------------------------------------------------- - # 8. Concatenate the auxiliary mask - # -------------------------------------------------------- - new_auxiliary_mask = [] - if data0._auxiliary_mask: - # data0 has an auxiliary mask - for mask in data0._auxiliary_mask: - size = mask.size - if (size > 1 and mask.shape[axis] > 1) or ( - size == 1 and mask.datum() - ): - new_shape = list(mask.shape) - new_shape[axis] = shape0[axis] - new_mask = cls.empty(new_shape, dtype=bool) - indices = [slice(None)] * new_mask.ndim - - indices[axis] = slice(0, original_shape0[axis]) - new_mask[tuple(indices)] = mask - - indices[axis] = slice(original_shape0[axis], None) - new_mask[tuple(indices)] = False - else: - new_auxiliary_mask.append(mask) - - new_auxiliary_mask.append(new_mask) - # --- End: for - - if data1._auxiliary_mask: - # data1 has an auxiliary mask - for mask in data1._auxiliary_mask: - size = mask.size - if (size > 1 and mask.shape[axis] > 1) or ( - size == 1 and mask.datum() - ): - new_shape = list(mask.shape) - new_shape[axis] = shape0[axis] - new_mask = cls.empty(new_shape, dtype=bool) - 
- indices = [slice(None)] * new_mask.ndim - - indices[axis] = slice(0, original_shape0[axis]) - new_mask[tuple(indices)] = False - - indices[axis] = slice(original_shape0[axis], None) - new_mask[tuple(indices)] = mask - else: - new_auxiliary_mask.append(mask) - - new_auxiliary_mask.append(new_mask) - # --- End: for - # --- End: if - - if new_auxiliary_mask: - data0._auxiliary_mask = new_auxiliary_mask - # # Set the concatenated auxiliary mask - # for mask in new_auxiliary_mask: - # data0._auxiliary_mask_add_component(mask) - # --- End: for + data0.dtype = np.result_type(dtype0, dtype1) # ------------------------------------------------------------ # Done @@ -5504,54 +4520,8 @@ def _move_flip_to_partitions(self): partition.flip = p_flip # --- End: for - # self._flip = [] self._flip([]) - # def _parse_axes(self, axes, method=None): - # ''' - # - # :Parameters: - # - # axes: (sequence of) `int` - # The axes of the data array. May be one of, or a sequence of - # any combination of zero or more of: - # - # * The integer position of a dimension in the data array - # (negative indices allowed). - # - # method: `str` - # - # :Returns: - # - # `list` - # - # **Examples:** - # - # ''' - # ndim = self._ndim - # - # if isinstance(axes, int): - # axes = (axes,) - # - # axes2 = [] - # for axis in axes: - # if 0 <= axis < ndim: - # axes2.append(axis) - # elif -ndim <= axis < 0: - # axes2.append(axis + ndim) - # else: - # raise ValueError( - # "Invalid axis: {!r}".format(method, axis)) - # # --- End: for - # - # # Check for duplicate axes - # n = len(axes2) - # if n > 1 and n > len(set(axes2)): - # raise ValueError("Can't {}: Duplicate axis: {}".format( - # method, axes2)) - # - # return axes2 - def _unary_operation(self, operation): """Implement unary arithmetic operations. 
@@ -5587,19 +4557,14 @@ def _unary_operation(self, operation): [[1 2 3 4 5]] """ - self.to_memory() - - new = self.copy() + out = self.copy(array=False) - config = new.partition_configuration(readonly=True) + dx = self.get_dask(copy=False) + dx = getattr(operator, operation)(dx) - for partition in new.partitions.matrix.flat: - partition.open(config) - array = partition.array - partition.subarray = getattr(operator, operation)(array) - partition.close() + out._set_dask(dx, reset_mask_hardness=False) - return new + return out def __add__(self, other): """The binary arithmetic operation ``+`` @@ -6022,89 +4987,6 @@ def __pos__(self): """ return self._unary_operation("__pos__") - def _all_axis_names(self): - """Return a set of all the dimension names in use by the data - array. - - Note that the output set includes dimensions of individual - partitions which are not dimensions of the master data array. - - :Returns: - - `list` of `str` - The axis names. - - **Examples:** - - >>> d._axes - ['dim1', 'dim0'] - >>> d.partitions.info('_dimensions') - [['dim0', 'dim0'], - ['dim1', 'dim0', 'dim2']] - >>> d._all_axis_names() - ['dim2', dim0', 'dim1'] - - """ - all_axes = self._all_axes - if not all_axes: - return list(self._axes) - else: - return list(all_axes) - - def _change_axis_names(self, axis_map): - """Change the axis names. - - The axis names are arbitrary, so mapping them to another - arbitrary collection does not change the data array values, - units, nor axis order. - - """ - # Find any axis names which are not mapped. If there are any, - # then update axis_map. 
- all_axes = self._all_axes - if all_axes: - d = set(all_axes).difference(axis_map) - if d: - axis_map = axis_map.copy() - existing_axes = list(all_axes) - for axis in d: - if axis in axis_map.values(): - axis_map[axis] = self._new_axis_identifier( - existing_axes - ) - existing_axes.append(axis) - else: - axis_map[axis] = axis - # --- End: if - - if all([axis0 == axis1 for axis0, axis1 in axis_map.items()]): - # Return without doing anything if the mapping is null - return - - # Axes - self._axes = [axis_map[axis] for axis in self._axes] - - # All axes - if all_axes: - self._all_axes = tuple([axis_map[axis] for axis in all_axes]) - - # Flipped axes - # flip = self._flip - flip = self._flip() - if flip: - self._flip([axis_map[axis] for axis in flip]) - # self._flip = [axis_map[axis] for axis in flip] - - # HDF chunks - chunks = self._HDF_chunks - if chunks: - self._HDF_chunks = dict( - [(axis_map[axis], size) for axis, size in chunks.items()] - ) - - # Partitions in the partition matrix - self.partitions.change_axis_names(axis_map) - @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) def _collapse( @@ -6204,7 +5086,7 @@ def _collapse( # shape of the data array then the weights are assumed # to span the collapse axes in the order in which they # are given - if numpy_shape(weights) == self_shape: + if np.shape(weights) == self_shape: weights = {tuple(self_axes): weights} else: weights = {tuple([self_axes[i] for i in axes]): weights} @@ -6229,12 +5111,12 @@ def _collapse( # --- End: if for key, weight in tuple(weights.items()): - if weight is None or numpy_size(weight) == 1: + if weight is None or np.size(weight) == 1: # Ignore undefined weights and size 1 weights del weights[key] continue - weight_ndim = numpy_ndim(weight) + weight_ndim = np.ndim(weight) if weight_ndim != len(key): raise ValueError( "Can't collapse: Incorrect number of weights " @@ -6247,11 +5129,11 @@ def _collapse( "axes (%d > %d)" % (weight.ndim, ndim) ) - for n, axis in 
zip(numpy_shape(weight), key): + for n, axis in zip(np.shape(weight), key): if n != self_shape[self_axes.index(axis)]: raise ValueError( "Can't collapse: Incorrect weights " - "shape {!r}".format(numpy_shape(weight)) + "shape {!r}".format(np.shape(weight)) ) # --- End: for @@ -6295,28 +5177,10 @@ def _collapse( # Add the weights to kwargs kwargs["weights"] = weights - # for key, weight in tuple(weights.items()): - # key = set(key) - # if len(key) > n_collapse_axes and key.issuperset(axes): - # shape = tuple(self.shape[i] for i in axes) - # raise ValueError( - # "Weights {!r} span too many axes. Expected " - # "weights shape to broadcast to {}".format( - # weight, shape) - # ) - # - # if key.difference(axes): - # raise ValueError( - # 'Weights {!r} span a non-collapse axis.'.format( - # weight) - # ) - # --- End: for - # If the input data array 'fits' in one chunk of memory, then # make sure that it has only one partition if ( - not mpi_on - and not _preserve_partitions + not _preserve_partitions and d._pmndim and d.fits_in_one_chunk_in_memory(d.dtype.itemsize) ): @@ -6327,7 +5191,7 @@ def _collapse( # ------------------------------------------------------------- new = d[(Ellipsis,) + (0,) * n_collapse_axes] - new._auxiliary_mask = None + # new._auxiliary_mask = None for partition in new.partitions.matrix.flat: # Do this so as not to upset the ref count on the # parittion's of d @@ -6354,64 +5218,8 @@ def _collapse( readonly=False, auxiliary_mask=None, extra_memory=False # DCH ??x ) - if mpi_on: - mode = collapse_parallel_mode() - if mode == 0: - # Calculate the number of partitions in each subspace, - # assuming this will always be the same in each one and - # compare to the number of partitions in the new partition - # matrix times the maximum number of partitions per process in - # each case. 
The latter is calculated by - # _flag_partitions_for_processing - new._flag_partitions_for_processing() - partition = new.partitions.matrix.item( - (0,) * new._pmndim - ) # "first" partition of new - indices = partition.indices[:n_non_collapse_axes] + c_slice - data = d[indices] - data._flag_partitions_for_processing() - n_data = data.partitions.matrix.size - n_new = new.partitions.matrix.size - - if ( - new._max_partitions_per_process * n_data - > data._max_partitions_per_process * n_new - ): - # "turn on" parallelism in _collapse_subspace - _parallelise_collapse_sub = True - # "turn off" parallelism in _collapse - _parallelise_collapse = False - else: - # "turn off" parallelism in _collapse_subspace - _parallelise_collapse_sub = False - # "turn on" parallelism in _collapse - _parallelise_collapse = True - # --- End: if - elif mode == 1: - # "turn off" parallelism in _collapse_subspace - _parallelise_collapse_sub = False - # "turn on" parallelism in _collapse - _parallelise_collapse = True - elif mode == 2: - # "turn on" parallelism in _collapse_subspace - _parallelise_collapse_sub = True - # "turn off" parallelism in _collapse - _parallelise_collapse = False - else: - raise ValueError("Invalid collapse parallel mode") - # --- End: if - else: - # "turn off" parallelism in both functions - _parallelise_collapse_sub = False - _parallelise_collapse = False - - # Flag which partitions will be processed on this rank. If - # _parallelise_collapse is False then all partitions will be - # flagged for processing. 
- new._flag_partitions_for_processing(_parallelise_collapse) - processed_partitions = [] - for pmindex, partition in numpy_ndenumerate(new.partitions.matrix): + for pmindex, partition in np.ndenumerate(new.partitions.matrix): if partition._process_partition: # Only process the partition if it is flagged partition.open(config) @@ -6444,7 +5252,7 @@ def _collapse( Nmax, mtol, _preserve_partitions=_preserve_partitions, - _parallelise_collapse_subspace=_parallelise_collapse_sub, + _parallelise_collapse_subspace=False, **kwargs, ) @@ -6462,7 +5270,7 @@ def _collapse( # are distributed to every rank and processed_partitions now # contains all the processed partitions from every rank. processed_partitions = self._share_partitions( - processed_partitions, _parallelise_collapse + processed_partitions, False ) # Put the processed partitions back in the partition matrix @@ -6473,17 +5281,9 @@ def _collapse( p_datatype = partition.subarray.dtype if datatype != p_datatype: - datatype = numpy_result_type(p_datatype, datatype) + datatype = np.result_type(p_datatype, datatype) # --- End: for - # Share the lock files created by each rank for each partition - # now in a temporary file so that __del__ knows which lock - # files to check if present - new._share_lock_files(_parallelise_collapse) - - new._all_axes = None - # new._flip = [] - new._flip([]) new._Units = new_units new.dtype = datatype @@ -6569,8 +5369,7 @@ def _collapse_subspace( # If the input data array 'fits' in one chunk of memory, then # make sure that it has only one partition if ( - not mpi_on - and not _preserve_partitions + not _preserve_partitions and data._pmndim and data.fits_in_memory(data.dtype.itemsize) ): @@ -6642,7 +5441,7 @@ def _collapse_subspace( if wmin == 0: # Mask the array where the weights are zero - array = numpy_ma_masked_where(w == 0, array, copy=True) + array = np.ma.masked_where(w == 0, array, copy=True) if array.mask.all(): # The array is all missing data partition.close() @@ -6660,12 +5459,8 
@@ def _collapse_subspace( shape = array.shape ndim = array.ndim new_shape = shape[:n_non_collapse_axes] - new_shape += ( - functools_reduce( - operator_mul, shape[n_non_collapse_axes:] - ), - ) - array = numpy_reshape(array.copy(), new_shape) + new_shape += (reduce(mul, shape[n_non_collapse_axes:]),) + array = np.reshape(array.copy(), new_shape) if weights is not None: w = kwargs["weights"] @@ -6674,7 +5469,7 @@ def _collapse_subspace( # opposed to spanning all axes) new_shape = (w.size,) - kwargs["weights"] = numpy_reshape(w, new_shape) + kwargs["weights"] = np.reshape(w, new_shape) # --- End: if p_out = func(array, masked=p_masked, **kwargs) @@ -6698,193 +5493,19 @@ def _collapse_subspace( # --- End: if # --- End: for - if _parallelise_collapse_subspace: - # Aggregate the outputs of each rank using the group=True - # keyword on fpartial on rank 0 only - for rank in range(1, mpi_size): - if mpi_rank == rank: - if out is None: - # out is None, so will not be sent - out_is_none = True - mpi_comm.send(out_is_none, dest=0) - else: - out_is_none = False - mpi_comm.send(out_is_none, dest=0) - out_props = [] - for item in out: - item_props = {} - if isinstance( - item, numpy_ndarray - ) and item.dtype.kind in {"b", "i", "u", "f", "c"}: - # The item is a supported numpy array, - # so can be sent without pickling it. - item_props["is_numpy_array"] = True - item_props["isMA"] = numpy_ma_isMA(item) - if item_props["isMA"]: - item_props["is_masked"] = ( - item.mask is not numpy_ma_nomask - ) - else: - item_props["is_masked"] = False - # --- End: if - item_props["shape"] = item.shape - item_props["dtype"] = item.dtype - else: - # The item is either not a numpy array - # or is, for example, an array of - # strings, so will be pickled when - # sent. - item_props["is_numpy_array"] = False - # --- End: if - out_props.append(item_props) - # --- End: for - - # Send information about the properties of - # each item in out so that it can be received - # correctly. 
- mpi_comm.send(out_props, dest=0) - - # Send each item in out to process 0 in the - # appropriate way. - for item, item_props in zip(out, out_props): - if item_props["is_numpy_array"]: - if item_props["is_masked"]: - mpi_comm.Send(item.data, dest=0) - mpi_comm.Send(item.mask, dest=0) - elif item_props["isMA"]: - mpi_comm.Send(item.data, dest=0) - else: - mpi_comm.Send(item, dest=0) - # --- End: if - else: - mpi_comm.send(item, dest=0) - # --- End: if - # --- End: for - elif mpi_rank == 0: - p_out_is_none = mpi_comm.recv(source=rank) - if p_out_is_none: - # p_out is None so there is nothing to do - continue - else: - # Receive information about the properties of - # p_out. - p_out_props = mpi_comm.recv(source=rank) - - # Receive each item in p_out in the correct - # way according to its properties. - p_out = [] - for item_props in p_out_props: - if item_props["is_numpy_array"]: - if item_props["is_masked"]: - item = numpy_ma_masked_all( - item_props["shape"], - dtype=item_props["dtype"], - ) - mpi_comm.Recv(item.data, source=rank) - mpi_comm.Recv(item.mask, source=rank) - elif item_props["isMA"]: - item = numpy_ma_empty( - item_props["shape"], - dtype=item_props["dtype"], - ) - mpi_comm.Recv(item.data, source=rank) - else: - item = numpy_empty( - item_props["shape"], - dtype=item_props["dtype"], - ) - mpi_comm.Recv(item, source=rank) - else: - item = mpi_comm.recv(source=rank) - # --- End: if - p_out.append(item) - # --- End: for - p_out = tuple(p_out) - - # Aggregate out and p_out if out is not None. 
- if out is None: - out = p_out - else: - out = fpartial(out, p_out, group=True) - # --- End: if - # --- End: if - # --- End: for - # --- End: if - - # Finalise - sub_samples = mpi_comm.gather(sub_samples, root=0) - if mpi_rank == 0: - sub_samples = sum(sub_samples) - out = self._collapse_finalise( - ffinalise, - out, - sub_samples, - masked, - Nmax, - mtol, - data, - n_non_collapse_axes, - ) - # --- End: if - - # Broadcast the aggregated result back from process 0 to - # all processes. - - # First communicate information about the result's - # properties. - if mpi_rank == 0: - out_props = {} - out_props["isMA"] = numpy_ma_isMA(out) - if out_props["isMA"]: - out_props["is_masked"] = out.mask is not numpy_ma_nomask - else: - out_props["is_masked"] = False - # --- End: if - out_props["shape"] = out.shape - out_props["dtype"] = out.dtype - else: - out_props = None - # --- End: if - out_props = mpi_comm.bcast(out_props, root=0) - - # Do the broadcast. - if out_props["is_masked"]: - if mpi_rank != 0: - out = numpy_ma_masked_all( - out_props["shape"], dtype=out_props["dtype"] - ) - # --- End: if - mpi_comm.Bcast(out.data, root=0) - mpi_comm.Bcast(out.mask, root=0) - elif out_props["isMA"]: - if mpi_rank != 0: - out = numpy_ma_empty( - out_props["shape"], dtype=out_props["dtype"] - ) - # --- End: if - mpi_comm.Bcast(out.data, root=0) - else: - if mpi_rank != 0: - out = numpy_empty( - out_props["shape"], dtype=out_props["dtype"] - ) - # --- End: if - mpi_comm.Bcast(out, root=0) - # --- End: if - else: - # In the case that the inner loop is not parallelised, - # just finalise. - out = self._collapse_finalise( - ffinalise, - out, - sub_samples, - masked, - Nmax, - mtol, - data, - n_non_collapse_axes, - ) - # --- End: if + # In the case that the inner loop is not parallelised, + # just finalise. 
+ out = self._collapse_finalise( + ffinalise, + out, + sub_samples, + masked, + Nmax, + mtol, + data, + n_non_collapse_axes, + ) + # # --- End: if return out @@ -6907,7 +5528,7 @@ def _collapse_finalise( out = cls._collapse_mask(out, masked, N, Nmax, mtol) else: # no data - return all masked - out = numpy_ma_masked_all( + out = np.ma.masked_all( data.shape[:n_non_collapse_axes], data.dtype ) @@ -6937,7 +5558,7 @@ def _collapse_mask(array, masked, N, Nmax, mtol): if masked and mtol < 1: x = N < (1 - mtol) * Nmax if x.any(): - array = numpy_ma_masked_where(x, array, copy=False) + array = np.ma.masked_where(x, array, copy=False) # --- End: if return array @@ -7022,7 +5643,7 @@ def _collapse_create_weights( zero_weights = zero_weights or (weight.min() <= 0) - masked = masked or numpy_ma_isMA(weight) + masked = masked or np.ma.isMA(weight) if weight.ndim != array_ndim: # Make sure that the weight has the same number of @@ -7068,7 +5689,7 @@ def _collapse_create_weights( # axes or b) The weights contain masked values weights_out = broadcast_array(weights_out, array_shape) - if masked and numpy_ma_isMA(array): + if masked and np.ma.isMA(array): if not (array.mask | weights_out.mask == array.mask).all(): raise ValueError( "The output weights mask {} is not compatible with " @@ -7118,60 +5739,13 @@ def _collapse_optimize_weights(self, weights): return weights - def _new_axis_identifier(self, existing_axes=None): - """Return an axis name not being used by the data array. - - The returned axis name will also not be referenced by partitions - of the partition matrix. - - :Parameters: - - existing_axes: sequence of `str`, optional - - :Returns: - - `str` - The new axis name. 
- - **Examples:** - - >>> d._all_axis_names() - ['dim1', 'dim0'] - >>> d._new_axis_identifier() - 'dim2' - - >>> d._all_axis_names() - ['dim1', 'dim0', 'dim3'] - >>> d._new_axis_identifier() - 'dim4' - - >>> d._all_axis_names() - ['dim5', 'dim6', 'dim7'] - >>> d._new_axis_identifier() - 'dim3' - - """ - if existing_axes is None: - existing_axes = self._all_axis_names() - - n = len(existing_axes) - axis = "dim%d" % n - while axis in existing_axes: - n += 1 - axis = "dim%d" % n - - return axis - # ---------------------------------------------------------------- # Private attributes # ---------------------------------------------------------------- @property def _Units(self): - """Storage for the units.""" - try: - return self._custom["_Units"] - except KeyError: - raise AttributeError() + """Storage for the units.""" + return self._custom["_Units"] @_Units.setter def _Units(self, value): @@ -7182,21 +5756,21 @@ def _Units(self): self._custom["_Units"] = _units_None @property - def _auxiliary_mask(self): - """Storage for the auxiliary mask.""" - return self._custom["_auxiliary_mask"] + def _cyclic(self): + """Storage for axis cyclicity. - @_auxiliary_mask.setter - def _auxiliary_mask(self, value): - self._custom["_auxiliary_mask"] = value + Contains a `set` that identifies which axes are cyclic (and + therefore allow cyclic slicing). The set contains a subset of + the axis identifiers defined by the `_axes` attribute. - @_auxiliary_mask.deleter - def _auxiliary_mask(self): - del self._custom["_auxiliary_mask"] + .. warning:: Never change the value of the `_cyclic` attribute + in-place. - @property - def _cyclic(self): - """Storage for axis cyclicity.""" + .. note:: When an axis identifier is removed from the `_axes` + attribute then it is automatically also removed from + the `_cyclic` attribute. 
+ + """ return self._custom["_cyclic"] @_cyclic.setter @@ -7205,20 +5779,7 @@ def _cyclic(self, value): @_cyclic.deleter def _cyclic(self): - del self._custom["_cyclic"] - - @property - def _dtype(self): - """Storage for the data type.""" - return self._custom["_dtype"] - - @_dtype.setter - def _dtype(self, value): - self._custom["_dtype"] = value - - @_dtype.deleter - def _dtype(self): - del self._custom["_dtype"] + self._custom["_cyclic"] = _empty_set @property def _HDF_chunks(self): @@ -7238,150 +5799,139 @@ def _HDF_chunks(self): del self._custom["_HDF_chunks"] @property - def partitions(self): - """Storage for the partitions matrix.""" - return self._custom["partitions"] - - @partitions.setter - def partitions(self, value): - self._custom["partitions"] = value - - @partitions.deleter - def partitions(self): - del self._custom["partitions"] - - @property - def _ndim(self): - """Storage for the number of dimensions.""" - return self._custom["_ndim"] + @daskified(daskified_log_level) + def _hardmask(self): + """Storage for the mask hardness. - @_ndim.setter - def _ndim(self, value): - self._custom["_ndim"] = value + Contains a `bool`, where `True` denotes a hard mask and + `False` denotes a soft mask. - @_ndim.deleter - def _ndim(self): - del self._custom["_ndim"] - - @property - def _size(self): - """Storage for the number of elements.""" - return self._custom["_size"] + See `hardmask` for details. - @_size.setter - def _size(self, value): - self._custom["_size"] = value + """ + return self._custom["_hardmask"] - @_size.deleter - def _size(self): - del self._custom["_size"] + @_hardmask.setter + def _hardmask(self, value): + self._custom["_hardmask"] = value @property - def _shape(self): - """Storage for the data shape.""" - return self._custom["_shape"] + @daskified(daskified_log_level) + def _axes(self): + """Storage for the axis identifiers. 
- @_shape.setter - def _shape(self, value): - self._custom["_shape"] = value + Contains a `tuple` of identifiers, one for each array axis. - @_shape.deleter - def _shape(self): - del self._custom["_shape"] + .. note:: When the axis identifiers are reset, then any axis + identifier named by the `_cyclic` attribute which is + not in the new `_axes` set is automatically removed + from the `_cyclic` attribute. - @property - def _axes(self): - """Storage for the axes names.""" + """ return self._custom["_axes"] @_axes.setter def _axes(self, value): - self._custom["_axes"] = value + self._custom["_axes"] = tuple(value) - @_axes.deleter - def _axes(self): - del self._custom["_axes"] + # Remove cyclic axes that are not in the new axes + cyclic = self._cyclic + if cyclic: + # Never change the value of the _cyclic attribute in-place + self._cyclic = cyclic.intersection(value) + # ---------------------------------------------------------------- + # Dask attributes + # ---------------------------------------------------------------- @property - def _all_axes(self): - """Storage for the full collection of axes names. - - :Returns: - - `None` or `tuple`. 
- - """ - return self._custom["_all_axes"] - - @_all_axes.setter - def _all_axes(self, value): - self._custom["_all_axes"] = value + def chunks(self): + """TODODASK.""" + return self.get_dask(copy=False).chunks - @_all_axes.deleter - def _all_axes(self): - del self._custom["_all_axes"] + @property + def force_compute(self): + """TODODASK See also config settings.""" + return self._custom.get("force_compute", False) - def _flip(self, *flip): - """TODO.""" - if flip: - self._custom["flip"] = flip[0] - else: - return self._custom["flip"] + @force_compute.setter + def force_compute(self, value): + self._custom["force_compute"] = bool(value) # ---------------------------------------------------------------- # Attributes # ---------------------------------------------------------------- @property + @daskified(daskified_log_level) def Units(self): """The `cf.Units` object containing the units of the data array. - Deleting this attribute is equivalent to setting it to an - undefined units object, so this attribute is guaranteed to always - exist. + Can be set to any units equivalent to the existing units. + + .. seealso:: `override_units`, `override_calendar` **Examples:** - >>> d.Units = Units('m') + >>> d = cf.Data([1, 2, 3], units='m') >>> d.Units - >>> del d.Units + >>> d.Units = cf.Units('kilometres') + >>> d.Units + + >>> d.Units = cf.Units('km') >>> d.Units - + """ return self._Units @Units.setter def Units(self, value): - units = getattr(self, "_Units", _units_None) - if units and not self._Units.equivalent(value, verbose=1): - raise ValueError( - "Can't set units (currently {!r}) to non-equivalent " - "units {!r}. Consider the override_units method.".format( - units, value + try: + old_units = self._Units + except KeyError: + pass + else: + if not old_units.equivalent(value): + raise ValueError( + f"Can't set to Units to {value!r} that are not " + f"equivalent to the current units {old_units!r}. " + "Consider using the override_units method instead." 
) - ) + + if not old_units: + self.override_units(value, inplace=True) + return + + if self.Units.equals(value): + return dtype = self.dtype + if dtype.kind in "iu": + if dtype.char in "iI": + dtype = _dtype_float32 + else: + dtype = _dtype_float - if dtype is not None: - if dtype.kind == "i": - char = dtype.char - if char == "i": - old_units = getattr(self, "_Units", None) - if old_units is not None and not old_units.equals(value): - self.dtype = "float32" - elif char == "l": - old_units = getattr(self, "_Units", None) - if old_units is not None and not old_units.equals(value): - self.dtype = float - # --- End: if + def cf_Units(x): + return Units.conform( + x=x, from_units=old_units, to_units=value, inplace=False + ) + + self._map_blocks( + cf_Units, + delete_source=False, + reset_mask_hardness=False, + dtype=dtype, + ) self._Units = value @Units.deleter def Units(self): - del self._Units # = _units_None + raise ValueError( + "Can't delete the Units attribute. " + "Consider using the override_units method instead." + ) @property def data(self): @@ -7396,33 +5946,13 @@ def data(self): return self @property + @daskified(daskified_log_level) def dtype(self): - """The `numpy` data-type of the data array. - - By default this is the data-type with the smallest size and - smallest scalar kind to which all sub-arrays of the master data - array may be safely cast without loss of information. For example, - if the sub-arrays have data-types 'int64' and 'float32' then the - master data array's data-type will be 'float64'; or if the - sub-arrays have data-types 'int64' and 'int32' then the master - data array's data-type will be 'int64'. - - Setting the data-type to a `numpy.dtype` object, or any object - convertible to a `numpy.dtype` object, will cause the master data - array elements to be recast to the specified type at the time that - they are next accessed, and not before. 
This does not immediately - change the master data array elements, so, for example, - reinstating the original data-type prior to data access results in - no loss of information. - - Deleting the data-type forces the default behaviour. Note that if - the data-type of any sub-arrays has changed after `dtype` has been - set (which could occur if the data array is accessed) then the - reinstated default data-type may be different to the data-type - prior to `dtype` being set. + """The `numpy` data-type of the data. **Examples:** + TODODASK >>> d = cf.Data([0.5, 1.5, 2.5]) >>> d.dtype dtype(float64') @@ -7449,39 +5979,18 @@ def dtype(self): [ 0.5 1.5 2.5] """ - datatype = self._dtype - if datatype is None: - config = self.partition_configuration(readonly=True) - - flat = self.partitions.matrix.flat - - partition = next(flat) - datatype = partition.subarray.dtype - if datatype is None: - partition.open(config) - datatype = partition.array.dtype - partition.close() - - for partition in flat: - array = partition.subarray - if array.dtype is None: - partition.open(config) - array = partition.array - partition.close() - - datatype = numpy_result_type(datatype, array) - - self._dtype = datatype - - return datatype + dx = self.get_dask(copy=False) + return dx.dtype @dtype.setter def dtype(self, value): - self._dtype = numpy_dtype(value) + dx = self.get_dask(copy=False) - @dtype.deleter - def dtype(self): - self._dtype = None + # Only change the datatype if it's different to that of the + # dask array + if dx.dtype != value: + dx = dx.astype(value) + self._set_dask(dx, reset_mask_hardness=False) @property def fill_value(self): @@ -7514,97 +6023,95 @@ def fill_value(self): self.del_fill_value(None) @property + @daskified(daskified_log_level) def hardmask(self): - """Whether the mask is hard (True) or soft (False). + """Hardness of the mask. - When the mask is hard, masked entries of the data array can not be - unmasked by assignment, but unmasked entries may still be masked. 
+ If the `hardmask` attribute is `True`, i.e. there is a hard + mask, then unmasking an entry will silently not occur. This is + the default, and prevents overwriting the mask. - When the mask is soft, masked entries of the data array may be - unmasked by assignment and unmasked entries may be masked. + If the `hardmask` attribute is `False`, i.e. there is a soft + mask, then masked entries may be overwritten with non-missing + values. - By default, the mask is hard. + To allow the unmasking of masked values, the mask must be + softened by setting the `hardmask` attribute to False, or + equivalently with the `soften_mask` method. + + The mask can be hardened by setting the `hardmask` attribute + to True, or equivalently with the `harden_mask` method. + + .. seealso:: `harden_mask`, `soften_mask`, `where`, + `__setitem__` **Examples:** + >>> d = cf.Data([1, 2, 3]) + >>> d.hardmask + True + >>> d[0] = cf.masked + >>> print(d.array) + [-- 2 3] + >>> d[...]= 999 + >>> print(d.array) + [-- 999 999] >>> d.hardmask = False >>> d.hardmask False + >>> d[...] = -1 + >>> print(d.array) + [-1 -1 -1] """ - return self._custom["hardmask"] + return self._hardmask @hardmask.setter def hardmask(self, value): - self._custom["hardmask"] = bool(value) - - @hardmask.deleter - def hardmask(self): - raise AttributeError( - "Can't delete {} attribute 'hardmask'".format( - self.__class__.__name__ - ) - ) + if value: + self.harden_mask() + else: + self.soften_mask() @property - def ismasked(self): + @daskified(daskified_log_level) + def is_masked(self): """True if the data array has any masked values. - **Examples:** + **Performance** + + `is_masked` causes all delayed operations to be executed. + + **Examples** >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) - >>> print(d.ismasked) + >>> print(d.is_masked) False >>> d[0, ...] 
= cf.masked - >>> d.ismasked + >>> d.is_masked True """ - if self._auxiliary_mask: - for m in self._auxiliary_mask: - if m.any(): - # Found a masked element - return True - # --- End: for - - # Still here? Then remove the auxiliary mask because it - # must be all False. - self._auxiliary_mask = None - - # Still here? - config = self.partition_configuration(readonly=True) - - for partition in self.partitions.matrix.flat: - partition.open(config) - partition.array - if partition.masked: - # Found a masked element - partition.close() - return True - - partition.close() - - # There are no masked elements - return False - @property - def ispartitioned(self): - """True if the data array is partitioned. + def is_masked(a): + out = np.ma.is_masked(a) + return np.array(out).reshape((1,) * a.ndim) - **Examples:** + dx = self.get_dask(copy=False) - >>> d._pmsize - 1 - >>> d.ispartitioned - False + out_ind = tuple(range(dx.ndim)) + dx_ind = out_ind - >>> d._pmsize - 2 - >>> d.ispartitioned - False + dx = da.blockwise( + is_masked, + out_ind, + dx, + dx_ind, + adjust_chunks={i: 1 for i in out_ind}, + dtype=bool, + ) - """ - return self._pmsize > 1 + return bool(dx.any()) @property def isscalar(self): @@ -7623,14 +6130,20 @@ def isscalar(self): False """ - return not self._ndim + return not self.ndim @property + @daskified(daskified_log_level) def nbytes(self): """Total number of bytes consumed by the elements of the array. Does not include bytes consumed by the array mask + **Performance** + + If the number of bytes is unknown then it is calculated + immediately by executing all delayed operations. 
+ **Examples:** >>> d = cf.Data([[1, 1.5, 2]]) @@ -7647,9 +6160,15 @@ def nbytes(self): 24 """ - return self._size * self.dtype.itemsize + dx = self.get_dask(copy=False) + if math.isnan(dx.size): + logger.warning("Computing nbytes: Performance may be degraded") + dx.compute_chunk_sizes() + + return dx.nbytes @property + @daskified(daskified_log_level) def ndim(self): """Number of dimensions in the data array. @@ -7676,72 +6195,19 @@ def ndim(self): 0 """ - return self._ndim - - @property - def _pmaxes(self): - """The axes of the partition matrix.""" - return self.partitions.axes - - @property - def _pmndim(self): - """Number of dimensions in the partition matrix. - - **Examples:** - - >>> d._pmshape - (4, 7) - >>> d._pmndim - 2 - - >>> d._pmshape - () - >>> d._pmndim - 0 - - """ - return self.partitions.ndim - - @property - def _pmsize(self): - """Number of partitions in the partition matrix. - - **Examples:** - - >>> d._pmshape - (4, 7) - >>> d._pmsize - 28 - - >>> d._pmndim - 0 - >>> d._pmsize - 1 - - """ - return self.partitions.size - - @property - def _pmshape(self): - """Tuple of the partition matrix's dimension sizes. - - **Examples:** - - >>> d._pmshape - (4, 7) - - >>> d._pmndim - 0 - >>> d._pmshape - () - - """ - return self.partitions.shape + dx = self.get_dask(copy=False) + return dx.ndim @property + @daskified(daskified_log_level) def shape(self): """Tuple of the data array's dimension sizes. + **Performance** + + If the shape of the data is unknown then it is calculated + immediately by executing all delayed operations. 
+ **Examples:** >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) @@ -7761,19 +6227,23 @@ def shape(self): () """ - try: - return self._shape - except Exception: - raise AttributeError( - "{!r} object has no attribute 'shape'".format( - self.__class__.__name__ - ) - ) + dx = self.get_dask(copy=False) + if math.isnan(dx.size): + logger.warning("Computing data shape: Performance may be degraded") + dx.compute_chunk_sizes() + + return dx.shape @property + @daskified(daskified_log_level) def size(self): """Number of elements in the data array. + **Performance** + + If the size of the data is unknown then it is calculated + immediately by executing all delayed operations. + **Examples:** >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) @@ -7797,20 +6267,32 @@ def size(self): 1 """ - return self._size + dx = self.get_dask(copy=False) + size = dx.size + if math.isnan(size): + logger.warning("Computing data size: Performance may be degraded") + dx.compute_chunk_sizes() + size = dx.size + + return size @property + @daskified(daskified_log_level) def array(self): """A numpy array copy the data array. - .. note:: If the data array is stored as date-time objects then a - numpy array of numeric reference times will be - returned. A numpy array of date-time objects may be - returned by the `datetime_array` attribute. + .. note:: If the data array is stored as date-time objects then a + numpy array of numeric reference times will be + returned. A numpy array of date-time objects may be + returned by the `datetime_array` attribute. - .. seealso:: `datetime_array`, `varray` + **Performance** - **Examples:** + `array` causes all delayed operations to be computed. + + .. 
seealso:: `datetime_array`, `varray` + + **Examples:** >>> d = cf.Data([1, 2, 3.0], 'km') >>> a = d.array @@ -7826,313 +6308,88 @@ def array(self): -99.0 km """ - # Set the auxiliary_mask keyword to None because we can apply - # it later in one go - config = self.partition_configuration( - readonly=True, auxiliary_mask=None - ) - - out_data_type = self.dtype - units = self.Units - - _dtarray = getattr(self, "_dtarray", False) - - if _dtarray: - del self._dtarray - out_data_type = _dtype_object - # if self._isdatetime(): - # pda_args['func'] = None - # elif self._isdatetime(): - # out_data_type = numpy_dtype(float) - # pda_args['func'] = dt2rt - # # Turn off data-type checking and partition updating - # pda_args['dtype'] = None - # pda_args['update'] = False - - partitions = self.partitions - - # Still here? - array_out = numpy_empty(self._shape, dtype=out_data_type) - - masked = False - - if not self.ndim: - # -------------------------------------------------------- - # array_out is a scalar array so index it with Ellipsis - # (as opposed to the empty tuple which would be returned - # from partition.indices). This prevents bad behaviour - # when p_array is a numpy array of objects (e.g. data-time - # objects). - # -------------------------------------------------------- - partition = partitions.matrix[()] - partition.open(config) - p_array = partition.array - - # copy okect? - - if _dtarray: - if not partition.isdt: - # Convert the partition subarray to an array - # of date-time objects - p_array = rt2dt(p_array, units) - elif partition.isdt: - # Convert the partition subarray to an array of - # reference time floats - p_array = dt2rt(p_array, None, units) - - if not masked and partition.masked: - array_out = array_out.view(numpy_ma_MaskedArray) - array_out.set_fill_value(self.get_fill_value(None)) - masked = True - - array_out[...] 
= p_array - partition.close() - - else: - # -------------------------------------------------------- - # array_out is not a scalar array, so it can safely be - # indexed with partition.indices in all cases. - # -------------------------------------------------------- - for partition in partitions.matrix.flat: - partition.open(config) - p_array = partition.array - - if _dtarray: - if not partition.isdt: - # Convert the partition subarray to an array - # of date-time objects - p_array = rt2dt(p_array, units) - elif partition.isdt: - # Convert the partition subarray to an array of - # reference time floats - p_array = dt2rt(p_array, None, units) - - # copy okect? - - if not masked and partition.masked: - array_out = array_out.view(numpy_ma_MaskedArray) - array_out.set_fill_value(self.get_fill_value(None)) - masked = True - - array_out[partition.indices] = p_array + dx = self.get_dask(copy=False) + a = dx.compute() - partition.close() - # --- End: for - - # ------------------------------------------------------------ - # Apply the auxiliary mask - # ------------------------------------------------------------ - if self._auxiliary_mask: - if not masked: - # Convert the output array to a masked array - array_out = array_out.view(numpy_ma_MaskedArray) - array_out.set_fill_value(self.get_fill_value(None)) - masked = True - - self._auxiliary_mask_tidy() - - for mask in self._auxiliary_mask: - array_out.mask = array_out.mask | mask.array - - if array_out.mask is numpy_ma_nomask: - # There are no masked points, so convert the array - # back to a non-masked array. - array_out = array_out.data - masked = False - # --- End: if - - if masked and self.hardmask: - # Harden the mask of the output array - array_out.harden_mask() + if np.ma.isMA(a): + if self.hardmask: + a.harden_mask() + else: + a.soften_mask() - return array_out + return a @property + @daskified(daskified_log_level) def datetime_array(self): """An independent numpy array of date-time objects. 
- Only applicable to data arrays with reference time units. - - If the calendar has not been set then the CF default calendar will - be used and the units will be updated accordingly. - - The data-type of the data array is unchanged. - - .. seealso:: `array`, `varray` - - **Examples:** - - """ - if not self.Units.isreftime: - raise ValueError( - "Can't create date-time array from units " - "{!r}".format(self.Units) - ) - - if getattr(self.Units, "calendar", None) == "none": - raise ValueError( - "Can't create date-time array from units {!r} because " - "calendar is 'none'".format(self.Units) - ) - - units, reftime = self.Units.units.split(" since ") - - d = self - - # Convert months and years to days, because cftime won't work - # otherwise. - if units in ("months", "month"): - d = self * _month_length - d.override_units( - Units( - "days since " + reftime, - calendar=getattr(self.Units, "calendar", None), - ), - inplace=True, - ) - elif units in ("years", "year", "yr"): - d = self * _year_length - d.override_units( - Units( - "days since " + reftime, - calendar=getattr(self.Units, "calendar", None), - ), - inplace=True, - ) - - d._dtarray = True - return d.array - - @property - def varray(self): - """A numpy array view the data array. - - Note that making changes to elements of the returned view changes - the underlying data. - - .. 
seealso:: `array`, `datetime_array` - - **Examples:** - - >>> a = d.varray - >>> type(a) - - >>> a - array([0, 1, 2, 3, 4]) - >>> a[0] = 999 - >>> d.varray - array([999, 1, 2, 3, 4]) - - """ - config = self.partition_configuration(readonly=False) - - data_type = self.dtype - - if getattr(self, "_dtarray", False): - del self._dtarray - elif self._isdatetime(): # self._isdt: - data_type = numpy_dtype(float) - config["func"] = dt2rt - # Turn off data-type checking and partition updating - config["dtype"] = None - - if self.partitions.size == 1: - # If there is only one partition, then we can return a - # view of the partition's data array without having to - # create an empty array and then filling it up partition - # by partition. - partition = self.partitions.matrix.item() - partition.open(config) - array = partition.array - # Note that there is no need to close the partition here. - self._dtype = data_type - - # source = self.source(None) - # if source is not None and source.get_compression_type(): - # self._del_Array(None) - - # Flip to []? - return array - - # Still here? - shape = self._shape - array_out = numpy_empty(shape, dtype=data_type) - masked = False - - config["readonly"] = True + Only applicable to data arrays with reference time units. - for partition in self.partitions.matrix.flat: - partition.open(config) - p_array = partition.array - - if not masked and partition.masked: - array_out = array_out.view(numpy_ma_MaskedArray) - array_out.set_fill_value(self.get_fill_value(None)) - masked = True - - array_out[partition.indices] = p_array + If the calendar has not been set then the CF default calendar will + be used and the units will be updated accordingly. 
- # Note that there is no need to close the partition here - # --- End: for - - # ------------------------------------------------------------ - # Apply an auxiliary mask - # ------------------------------------------------------------ - if self._auxiliary_mask: - if not masked: - # Convert the output array to a masked array - array_out = array_out.view(numpy_ma_MaskedArray) - array_out.set_fill_value(self.get_fill_value(None)) - masked = True + The data-type of the data array is unchanged. - self._auxiliary_mask_tidy() + .. seealso:: `array` - for mask in self._auxiliary_mask: - array_out.mask = array_out.mask | mask.array + **Examples:** - if array_out.mask is numpy_ma_nomask: - # There are no masked points, so convert back to a - # non-masked array. - array_out = array_out.data - masked = False + **Performance** - self._auxiliary_mask = None - # --- End: if + `datetime_array` causes all delayed operations to be computed. - if masked and self.hardmask: - # Harden the mask of the output array - array_out.harden_mask() + """ + units = self.Units - # matrix = _xxx.copy() + if not units.isreftime: + raise ValueError( + f"Can't create date-time array from units {self.Units!r}" + ) - if not array_out.ndim and not isinstance(array_out, numpy_ndarray): - array_out = numpy_asanyarray(array_out) + if getattr(units, "calendar", None) == "none": + raise ValueError( + f"Can't create date-time array from units {self.Units!r} " + "because calendar is 'none'" + ) - self._set_partition_matrix( - array_out, chunk=False, check_free_memory=False - ) + units, reftime = units.units.split(" since ") - # matrix[()] = Partition(subarray = array_out, - # location = [(0, n) for n in shape], - # axes = self._axes, - # flip = [], - # shape = list(shape), - # Units = self.Units, - # part = [] - # ) - # - # self.partitions = PartitionMatrix(matrix, []) + # Convert months and years to days, because cftime won't work + # otherwise. 
+ if units in ("months", "month"): + d = self * _month_length + d.override_units( + Units( + f"days since {reftime}", + calendar=getattr(units, "calendar", None), + ), + inplace=True, + ) + elif units in ("years", "year", "yr"): + d = self * _year_length + d.override_units( + Units( + f"days since {reftime}", + calendar=getattr(units, "calendar", None), + ), + inplace=True, + ) + else: + d = self - self._dtype = data_type + dx = d.get_dask(copy=False) + dx = convert_to_datetime(dx, d.Units) # TODODASK - # self._flip = [] - self._flip([]) + a = dx.compute() - # source = self.source(None) - # if source is not None and source.get_compression_type(): - # self._del_Array(None) + if np.ma.isMA(a): + if self.hardmask: + a.harden_mask() + else: + a.soften_mask() - return array_out + return a @property def mask(self): @@ -8184,7 +6441,7 @@ def mask(self): mask._Units = _units_None mask.dtype = _dtype_bool - mask.hardmask = True + mask._hardmask = True return mask @@ -8475,7 +6732,7 @@ def arctan(self, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - d.func(numpy_arctan, units=_units_radians, inplace=True) + d.func(np.arctan, units=_units_radians, inplace=True) return d @@ -8564,7 +6821,7 @@ def arctanh(self, inplace=False): # preserve_invalid necessary because arctanh has a restricted domain d.func( - numpy_arctanh, + np.arctanh, units=_units_radians, inplace=True, preserve_invalid=True, @@ -8616,7 +6873,7 @@ def arcsin(self, inplace=False): # preserve_invalid necessary because arcsin has a restricted domain d.func( - numpy_arcsin, + np.arcsin, units=_units_radians, inplace=True, preserve_invalid=True, @@ -8664,7 +6921,7 @@ def arcsinh(self, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - d.func(numpy_arcsinh, units=_units_radians, inplace=True) + d.func(np.arcsinh, units=_units_radians, inplace=True) return d @@ -8713,7 +6970,7 @@ def arccos(self, inplace=False): # preserve_invalid necessary because arccos has a restricted domain 
d.func( - numpy_arccos, + np.arccos, units=_units_radians, inplace=True, preserve_invalid=True, @@ -8765,7 +7022,7 @@ def arccosh(self, inplace=False): # preserve_invalid necessary because arccosh has a restricted domain d.func( - numpy_arccosh, + np.arccosh, units=_units_radians, inplace=True, preserve_invalid=True, @@ -8773,30 +7030,6 @@ def arccosh(self, inplace=False): return d - def add_partitions(self, extra_boundaries, pdim): - """Add partition boundaries. - - :Parameters: - - extra_boundaries: `list` of `int` - The boundaries of the new partitions. - - pdim: `str` - The name of the axis to have the new partitions. - - :Returns: - - `None` - - **Examples:** - - >>> d.add_partitions( ) - - """ - self.partitions.add_partitions( - self._axes, self._flip(), extra_boundaries, pdim - ) - def all(self): """Test whether all data array elements evaluate to True. @@ -8840,7 +7073,7 @@ def all(self): partition.open(config) array = partition.array a = array.all() - if not a and a is not numpy_ma_masked: + if not a and a is not np.ma.masked: partition.close() return False @@ -9312,7 +7545,7 @@ def argmax(self, axis=None, unravel=False): for partition in self.partitions.matrix.flat: partition.open(config) array = partition.array - index = numpy_unravel_index(array.argmax(), array.shape) + index = np.unravel_index(array.argmax(), array.shape) mx = array[index] index = [x[0] + i for x, i in zip(partition.location, index)] out.append((mx, index)) @@ -9323,7 +7556,7 @@ def argmax(self, axis=None, unravel=False): if unravel: return tuple(index) - return numpy_ravel_multi_index(index, self.shape) + return np.ravel_multi_index(index, self.shape) # Parse axis ndim = self._ndim @@ -9338,7 +7571,7 @@ def argmax(self, axis=None, unravel=False): sections = self.section(axis, chunks=True) for key, d in sections.items(): array = d.varray.argmax(axis=axis) - array = numpy_expand_dims(array, axis) + array = np.expand_dims(array, axis) sections[key] = type(self)( array, self.Units, 
fill_value=self.fill_value ) @@ -9424,7 +7657,7 @@ def get_calendar(self, default=ValueError()): """ try: return self.Units.calendar - except AttributeError: + except (AttributeError, KeyError): return super().get_calendar(default=default) def set_calendar(self, calendar): @@ -9455,40 +7688,6 @@ def set_calendar(self, calendar): """ self.Units = Units(self.get_units(default=None), calendar) - # def set_fill_value(self, value): - # '''Set the missing data value. - # - # .. seealso:: `del_fill_value`, `get_fill_vlaue` - # - # :Parameters: - # - # value: scalar - # The new fill value. - # - # :Returns: - # - # `None` - # - # **Examples:** - # - # >>> f.set_fill_value(-9999) - # >>> f.get_fill_value() - # -9999 - # >>> print(f.del_fill_value()) - # -9999 - # >>> f.get_fill_value() - # ValueError: Can't get non-existent fill value - # >>> f.get_fill_value(10**10) - # 10000000000 - # >>> print(f.get_fill_value(None)) - # None - # >>> f.set_fill_value(None) - # >>> print(f.get_fill_value()) - # None - # - # ''' - # self._fill_value = value - def set_units(self, value): """Set the units. @@ -9541,8 +7740,6 @@ def maximum( {{inplace: `bool`, optional}} - {{i: deprecated at version 3.0.0}} - :Returns: `Data` or `None` @@ -9551,16 +7748,17 @@ def maximum( **Examples:** """ - return self._collapse( - max_f, - max_fpartial, - max_ffinalise, - axes=axes, - squeeze=squeeze, - mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, - ) + # TODODASK: Placeholder for the real thing, that takes into + # account axes=axes, squeeze=squeeze, mtol=mtol, + # inplace=inplace. 
+ # + # This is only here for now, in this form, to ensure that + # cf.read works + return self.get_dask(copy=False).max() + + # return self._collapse(max_f, max_fpartial, max_ffinalise, axes=axes, + # squeeze=squeeze, mtol=mtol, inplace=inplace, + # _preserve_partitions=_preserve_partitions) def maximum_absolute_value( self, @@ -10114,10 +8312,10 @@ def binary_mask(self): array = array.astype(bool) if partition.masked: # data is masked - partition.subarray = numpy_ma_array(array, "int32") + partition.subarray = np.ma.array(array, "int32") else: # data is not masked - partition.subarray = numpy_array(array, "int32") + partition.subarray = np.array(array, "int32") partition.Units = _units_1 @@ -10244,13 +8442,12 @@ def asdata(cls, d, dtype=None, copy=False): data = data() if copy: data = data.copy() - if dtype is not None and numpy_dtype(dtype) != data.dtype: + if dtype is not None and np.dtype(dtype) != data.dtype: data.dtype = dtype else: - if dtype is not None and numpy_dtype(dtype) != data.dtype: + if dtype is not None and np.dtype(dtype) != data.dtype: data = data.copy() data.dtype = dtype - # --- End: if return data @@ -10269,6 +8466,7 @@ def close(self): >>> d.close() """ + print("TODODASK - is this still needed/valid? 
Not needed") for partition in self.partitions.matrix.flat: partition.file_close() @@ -10345,7 +8543,7 @@ def compressed(self, inplace=False): break array = d[i : i + n].array - if numpy_ma_isMA(array): + if np.ma.isMA(array): array = array.compressed() size = array.size @@ -10418,7 +8616,7 @@ def cos(self, inplace=False, i=False): if d.Units.equivalent(_units_radians): d.Units = _units_radians - d.func(numpy_cos, units=_units_1, inplace=True) + d.func(np.cos, units=_units_1, inplace=True) return d @@ -10456,11 +8654,12 @@ def count(self): 8 """ + # TODODASK - daskify, previously parallelise=mpi_on (not =False) config = self.partition_configuration(readonly=True) n = 0 - self._flag_partitions_for_processing(parallelise=mpi_on) + # self._flag_partitions_for_processing(parallelise=mpi_on) processed_partitions = [] for pmindex, partition in self.partitions.ndenumerate(): @@ -10468,7 +8667,7 @@ def count(self): partition.open(config) partition._pmindex = pmindex array = partition.array - n += numpy_ma_count(array) + n += np.ma.count(array) partition.close() processed_partitions.append(partition) # --- End: if @@ -10481,7 +8680,7 @@ def count(self): # are distributed to every rank and processed_partitions now # contains all the processed partitions from every rank. 
processed_partitions = self._share_partitions( - processed_partitions, parallelise=mpi_on + processed_partitions, parallelise=False ) # Put the processed partitions back in the partition matrix @@ -10494,12 +8693,12 @@ def count(self): # Share the lock files created by each rank for each partition # now in a temporary file so that __del__ knows which lock # files to check if present - self._share_lock_files(parallelise=mpi_on) + self._share_lock_files(parallelise=False) # Aggregate the results on each process and return on all # processes - if mpi_on: - n = mpi_comm.allreduce(n, op=mpi_sum) + # if mpi_on: + # n = mpi_comm.allreduce(n, op=mpi_sum) # --- End: if return n @@ -10536,21 +8735,14 @@ def cyclic(self, axes=None, iscyclic=True): if axes is None: return old - parsed_axes = self._parse_axes(axes) - axes = [data_axes[i] for i in parsed_axes] + axes = [data_axes[i] for i in self._parse_axes(axes)] + # Never change the value of the _cyclic attribute in-place if iscyclic: self._cyclic = cyclic_axes.union(axes) else: self._cyclic = cyclic_axes.difference(axes) - # Make sure that the auxiliary mask has the same cyclicity - auxiliary_mask = self._custom.get("_auxiliary_mask") - if auxiliary_mask is not None: - self._auxiliary_mask = [mask.copy() for mask in auxiliary_mask] - for mask in self._auxiliary_mask: - mask.cyclic(parsed_axes, iscyclic) - return old def _YMDhms(self, attr): @@ -10815,7 +9007,7 @@ def unique(self): for partition in self.partitions.matrix.flat: partition.open(config) array = partition.array - array = numpy_unique(array) + array = np.unique(array) if partition.masked: # Note that compressing a masked array may result in @@ -10830,7 +9022,7 @@ def unique(self): partition.close() - u = numpy.unique(numpy_array(u, dtype=self.dtype)) + u = np.unique(np.array(u, dtype=self.dtype)) return type(self)(u, units=self.Units) @@ -10895,27 +9087,8 @@ def ndindex(self): **Examples:** - >>> d.shape - (2, 1, 3) - >>> for i in d.ndindex(): - ... print(i) - ... 
- (0, 0, 0) - (0, 0, 1) - (0, 0, 2) - (1, 0, 0) - (1, 0, 1) - (1, 0, 2) - - > d.shape - () - >>> for i in d.ndindex(): - ... print(i) - ... - () - """ - return itertools.product(*[range(0, r) for r in self._shape]) + return product(*[range(0, r) for r in self.shape]) @_deprecated_kwarg_check("traceback") @_manage_log_level_via_verbosity @@ -11055,16 +9228,16 @@ def exp(self, inplace=False, i=False): if d.Units: d.Units = _units_1 - d.func(numpy_exp, inplace=True) + d.func(np.exp, inplace=True) return d + @daskified(daskified_log_level) @_inplace_enabled(default=False) def insert_dimension(self, position=0, inplace=False): """Expand the shape of the data array in place. - Insert a new size 1 axis, corresponding to a given position in the - data array shape. + # TODODASK bring back expand_dims alias (or rather alias this to that) .. seealso:: `flip`, `squeeze`, `swapaxes`, `transpose` @@ -11087,47 +9260,29 @@ def insert_dimension(self, position=0, inplace=False): d = _inplace_enabled_define_and_cleanup(self) # Parse position - ndim = self._ndim + if not isinstance(position, int): + raise ValueError("Position parameter must be an integer") + + ndim = d.ndim if -ndim - 1 <= position < 0: position += ndim + 1 elif not 0 <= position <= ndim: raise ValueError( - "Can't insert dimension: Invalid position (%d)" % position + f"Can't insert dimension: Invalid position {position!r}" ) - # Expand _axes - axis = d._new_axis_identifier() - data_axes = d._axes[:] - data_axes.insert(position, axis) - d._axes = data_axes - - # Increment ndim and expand shape - d._ndim += 1 - shape = list(d._shape) + shape = list(d.shape) shape.insert(position, 1) - d._shape = tuple(shape) - - # Expand the location and shape of each partition - location = (0, 1) - for partition in d.partitions.matrix.flat: - partition.location = partition.location[:] - partition.shape = partition.shape[:] - - partition.location.insert(position, location) - partition.shape.insert(position, 1) - - if d._all_axes: - 
d._all_axes += (axis,) - # HDF chunks - if self._HDF_chunks: - self._HDF_chunks[axis] = 1 + dx = d.get_dask(copy=False) + dx = dx.reshape(shape) + d._set_dask(dx, reset_mask_hardness=False) - # Expand dims in the auxiliary mask - if d._auxiliary_mask: - for mask in d._auxiliary_mask: - mask.insert_dimension(position, inplace=True) - # --- End: if + # Expand _axes + axis = new_axis_identifier(d._axes) + data_axes = list(d._axes) + data_axes.insert(position, axis) + d._axes = data_axes return d @@ -11152,6 +9307,7 @@ def get_filenames(self): set() """ + print("TODODASK - is this still possible?") out = set( [ abspath(p.subarray.get_filename()) @@ -11415,13 +9571,7 @@ def halo( "Got {!r}, {!r}".format(X_axis, Y_axis) ) - for A, axis in zip( - ( - "X", - "Y", - ), - (X_axis, Y_axis), - ): + for A, axis in zip(("X", "Y"), (X_axis, Y_axis)): if axis not in axes: raise ValueError( "If dimensions have been identified with the " @@ -11502,7 +9652,7 @@ def halo( # Corners # ------------------------------------------------------------ if len(axes) > 1: - for indices in itertools.product( + for indices in product( *[ (slice(0, size[i]), slice(-size[i], None)) if i in axes @@ -11550,66 +9700,77 @@ def halo( return d - def has_calendar(self): - """Whether a calendar has been set. + def harden_mask(self): + """Force the mask to hard. - .. seealso:: `del_calendar`, `get_calendar`, `set_calendar`, - `has_units` + Whether the mask of a masked array is hard or soft is + determined by its `hardmask` property. `harden_mask` sets + `hardmask` to `True`. - :Returns: + .. versionadded:: TODODASK - `bool` - True if the calendar has been set, otherwise False. + .. 
seealso:: `hardmask`, `soften_mask` **Examples:** - >>> d.set_calendar('360_day') - >>> d.has_calendar() - True - >>> d.get_calendar() - '360_day' - >>> d.del_calendar() - >>> d.has_calendar() + >>> d = cf.Data([1, 2, 3], hardmask=False) + >>> d.hardmask False - >>> d.get_calendar() - ValueError: Can't get non-existent calendar - >>> print(d.get_calendar(None)) - None - >>> print(d.del_calendar(None)) - None + >>> d.harden_mask() + >>> d.hardmask + True + + >>> d = cf.Data([1, 2, 3], mask=[False, True, False]) + >>> d.hardmask + True + >>> d[1] = 999 + >>> print(d.array) + [1 -- 3] """ - return hasattr(self.Units, "calendar") + self._map_blocks( + cf_harden_mask, + delete_source=False, + reset_mask_hardness=False, + dtype=self.dtype, + ) + self._hardmask = True - def has_units(self): - """Whether units have been set. + def soften_mask(self): + """Force the mask to soft. - .. seealso:: `del_units`, `get_units`, `set_units`, `has_calendar` + Whether the mask of a masked array is hard or soft is + determined by its `hardmask` property. `soften_mask` sets + `hardmask` to `False`. - :Returns: + .. versionadded:: TODODASK - `bool` - True if units have been set, otherwise False. + .. 
seealso:: `hardmask`, `harden_mask` **Examples:** - >>> d.set_units('metres') - >>> d.has_units() + >>> d = cf.Data([1, 2, 3]) + >>> d.hardmask True - >>> d.get_units() - 'metres' - >>> d.del_units() - >>> d.has_units() + >>> d.soften_mask() + >>> d.hardmask False - >>> d.get_units() - ValueError: Can't get non-existent units - >>> print(d.get_units(None)) - None - >>> print(d.del_units(None)) - None + + >>> d = cf.Data([1, 2, 3], mask=[False, True, False], hardmask=False) + >>> d.hardmask + False + >>> d[1] = 999 + >>> print(d.array) + [ 1 999 3] """ - return hasattr(self.Units, "units") + self._map_blocks( + cf_soften_mask, + delete_source=False, + reset_mask_hardness=False, + dtype=self.dtype, + ) + self._hardmask = False @_inplace_enabled(default=False) def filled(self, fill_value=None, inplace=False): @@ -11672,6 +9833,129 @@ def filled(self, fill_value=None, inplace=False): return d + def first_element(self, verbose=None): + """Return the first element of the data as a scalar. + + If the value is deemed too expensive to compute then a + `ValueError` is raised instead. It is considered acceptable to + compute the value in the following circumstances: + + * The `force_compute` attribute is True. + + * The current log level is ``'DEBUG'``. + + * The stored computations consist only of initialisation, + subspace or copy functions. + + .. versionadded:: 4.0.0 + + .. seealso:: `last_element`, `second_element` + + :Returns: + + The first element of the data + + **Examples:** + + >>> d = cf.Data([[1, 2], [3, 4]]) + >>> d.first_element() + 1 + >>> d[0, 0] = cf.masked + >>> d.first_element() + masked + + """ + if self.can_compute(): + return super().first_element() + + raise ValueError( + "First element of the data is considered too expensive " + "to compute. Consider setting the 'force_compute' attribute, or " + "setting the log level to 'DEBUG'." + ) + + def second_element(self, verbose=None): + """Return the second element of the data as a scalar. 
+ + If the value is deemed too expensive to compute then a + `ValueError` is raised instead. It is considered acceptable to + compute the value in the following circumstances: + + * The `force_compute` attribute is True. + + * The current log level is ``'DEBUG'``. + + * The stored computations consist only of initialisation, + subspace or copy functions. + + .. versionadded:: 4.0.0 + + .. seealso:: `last_element`, `first_element` + + :Returns: + + The second element of the data + + **Examples:** + + >>> d = cf.Data([[1, 2], [3, 4]]) + >>> d.second_element() + 2 + >>> d[0, 1] = cf.masked + >>> d.second_element() + masked + + """ + if self.can_compute(): + return super().second_element() + + raise ValueError( + "Second element of the data is considered too expensive " + "to compute. Consider setting the 'force_compute' atribute, or " + "setting the log level to 'DEBUG'." + ) + + def last_element(self): + """Return the last element of the data as a scalar. + + If the value is deemed too expensive to compute then a + `ValueError` is raised instead. It is considered acceptable to + compute the value in the following circumstances: + + * The `force_compute` attribute is True. + + * The current log level is ``'DEBUG'``. + + * The stored computations consist only of initialisation, + subspace or copy functions. + + .. versionadded:: 4.0.0 + + .. seealso:: `first_element`, `second_element` + + :Returns: + + The last element of the data + + **Examples:** + + >>> d = cf.Data([[1, 2], [3, 4]]) + >>> d.last_element() + 4 + >>> d[1, 1] = cf.masked + >>> d.last_element() + masked + + """ + if self.can_compute(): + return super().last_element() + + raise ValueError( + "First element of the data is considered too expensive " + "to compute. Consider setting the 'force_compute' attribute, or " + "setting the log level to 'DEBUG'." + ) + def flat(self, ignore_masked=True): """Return a flat iterator over elements of the data array. 
@@ -11722,6 +10006,8 @@ def flat(self, ignore_masked=True): def flatten(self, axes=None, inplace=False): """Flatten axes of the data. + TODODASK - check against dask flatten behaviour + Any subset of the axes may be flattened. The shape of the data may change, but the size will not. @@ -11839,7 +10125,7 @@ def flatten(self, axes=None, inplace=False): return d new_shape = [n for i, n in enumerate(shape) if i not in axes] - new_shape.insert(axes[0], numpy_prod([shape[i] for i in axes])) + new_shape.insert(axes[0], np.prod([shape[i] for i in axes])) out = d.empty(new_shape, dtype=d.dtype, units=d.Units, chunk=True) out.hardmask = False @@ -11895,7 +10181,7 @@ def floor(self, inplace=False, i=False): [-2. -2. -2. -1. 0. 1. 1. 1. 1.] """ - return self.func(numpy_floor, out=True, inplace=inplace) + return self.func(np.floor, out=True, inplace=inplace) @_deprecated_kwarg_check("i") def outerproduct(self, e, inplace=False, i=False): @@ -11948,14 +10234,14 @@ def outerproduct(self, e, inplace=False, i=False): [18 21 24 27]]] """ - e_ndim = numpy_ndim(e) + e_ndim = np.ndim(e) if e_ndim: if inplace: d = self else: d = self.copy() - for j in range(numpy_ndim(e)): + for j in range(np.ndim(e)): d.insert_dimension(-1, inplace=True) else: d = self @@ -12037,25 +10323,7 @@ def override_units(self, units, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) - units = Units(units) - - config = self.partition_configuration(readonly=False) - - for partition in d.partitions.matrix.flat: - p_units = partition.Units - if not p_units or p_units == units: - # No need to create the data array if the sub-array - # units are the same as the master data array units or - # the partition units are not set - partition.Units = units - continue - - partition.open(config) - partition.array - partition.Units = units - partition.close() - - d._Units = units + d._Units = Units(units) return d @@ -12087,17 +10355,6 @@ def override_calendar(self, calendar, inplace=False, i=False): 
""" d = _inplace_enabled_define_and_cleanup(self) - - if not self.Units.isreftime: - raise ValueError( - "Can't override the calendar of non-reference-time " - "units: {0!r}".format(self.Units) - ) - - for partition in d.partitions.matrix.flat: - partition.Units = Units(partition.Units._units, calendar) - partition.close() - d._Units = Units(d.Units._units, calendar) return d @@ -12117,6 +10374,7 @@ def to_disk(self): >>> d.to_disk() """ + print("TODODASK - ???") config = self.partition_configuration(readonly=True, to_disk=True) for partition in self.partitions.matrix.flat: @@ -12153,6 +10411,7 @@ def to_memory(self, regardless=False, parallelise=False): >>> d.to_memory(regardless=True) """ + print("TODODASK - ???") config = self.partition_configuration(readonly=True) fm_threshold = cf_fm_threshold() @@ -12185,6 +10444,7 @@ def in_memory(self): >>> d.in_memory """ + print("TODODASK - ???") for partition in self.partitions.matrix.flat: if not partition.in_memory: return False @@ -12192,47 +10452,13 @@ def in_memory(self): return True - def partition_boundaries(self): - """Return the partition boundaries for each partition matrix - dimension. - - :Returns: - - `dict` - - **Examples:** - - """ - return self.partitions.partition_boundaries(self._axes) - - def partition_configuration(self, readonly, **kwargs): - """Return parameters for opening and closing array partitions. - - If dtype=None then data-type checking is disabled. - - """ - config = { - "readonly": readonly, - "axes": self._axes, - "flip": self._flip(), - "hardmask": self.hardmask, - "auxiliary_mask": self._auxiliary_mask, - "units": self.Units, - "dtype": self._dtype, - "func": None, - "update": True, - "serial": True, - } - - if kwargs: - config.update(kwargs) - - return config - def datum(self, *index): """Return an element of the data array as a standard Python scalar. + TODODASK: consider renameing/aliasing to 'item'. Might depend + on whether or not the APIs are the same. 
+ The first and last elements are always returned with ``d.datum(0)`` and ``d.datum(-1)`` respectively, even if the data array is a scalar array or has two or more dimensions. @@ -12328,16 +10554,16 @@ def datum(self, *index): index = index[0] if index == 0: # This also works for scalar arrays - index = (slice(0, 1),) * self._ndim + index = (slice(0, 1),) * self.ndim elif index == -1: # This also works for scalar arrays - index = (slice(-1, None),) * self._ndim + index = (slice(-1, None),) * self.ndim elif isinstance(index, int): if index < 0: index += self._size - index = numpy_unravel_index(index, self._shape) - elif len(index) == self._ndim: + index = np.unravel_index(index, self.shape) + elif len(index) == self.ndim: index = tuple(index) else: raise ValueError( @@ -12345,7 +10571,7 @@ def datum(self, *index): self.__class__.__name__ ) ) - elif n_index != self._ndim: + elif n_index != self.ndim: raise ValueError( "Incorrect number of indices for {} array".format( self.__class__.__name__ @@ -12354,7 +10580,7 @@ def datum(self, *index): array = self[index].array - elif self._size == 1: + elif self.size == 1: array = self.array else: @@ -12363,11 +10589,11 @@ def datum(self, *index): "Python scalar".format(self.__class__.__name__) ) - if not numpy_ma_isMA(array): + if not np.ma.isMA(array): return array.item() mask = array.mask - if mask is numpy_ma_nomask or not mask.item(): + if mask is np.ma.nomask or not mask.item(): return array.item() return cf_masked @@ -12436,9 +10662,9 @@ def mask_invalid(self, inplace=False, i=False): partition.open(config) array = partition.array - array = numpy_ma_masked_invalid(array, copy=False) + array = np.ma.masked_invalid(array, copy=False) array.shrink_mask() - if array.mask is numpy_ma_nomask: + if array.mask is np.ma.nomask: array = array.data partition.subarray = array @@ -12578,9 +10804,9 @@ def masked_all(cls, shape, dtype=None, units=None, chunk=True): """ array = FilledArray( shape=tuple(shape), - 
size=functools_reduce(operator_mul, shape, 1), + size=reduce(mul, shape, 1), ndim=len(shape), - dtype=numpy_dtype(dtype), + dtype=np.dtype(dtype), fill_value=cf_masked, ) @@ -12632,6 +10858,7 @@ def mid_range( _preserve_partitions=_preserve_partitions, ) + @daskified(daskified_log_level) @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) def flip(self, axes=None, inplace=False, i=False): @@ -12670,58 +10897,23 @@ def flip(self, axes=None, inplace=False, i=False): d = _inplace_enabled_define_and_cleanup(self) if axes is not None and not axes and axes != 0: - # Null flip - if inplace: - d = None return d if axes is None: - iaxes = list(range(d._ndim)) + iaxes = range(d.ndim) else: - iaxes = d._parse_axes(axes) # , 'flip') + iaxes = d._parse_axes(axes) - # reverse = d._flip[:] - reverse = d._flip()[:] - data_axes = d._axes - partitions = d.partitions - _pmaxes = partitions.axes - - flip_partition_matrix = False - if _pmaxes: - indices = [slice(None)] * partitions.ndim - - for i in iaxes: - axis = data_axes[i] - - if axis in reverse: - reverse.remove(axis) - else: - reverse.append(axis) - - if axis in _pmaxes: - # This flip axis is also an axis of the partition - # matrix - indices[_pmaxes.index(axis)] = slice(None, None, -1) - flip_partition_matrix = True - # --- End: for - - d._flip(reverse) - # d._flip = reverse - - if flip_partition_matrix: - # TODO some problem here with d[:, 1:-2:-1] *= 10 + if not iaxes: + return d - # At least one of the flip axes is also an axis of the - # partition matrix - partitions = partitions[tuple(indices)] - partitions.set_location_map(data_axes) - d.partitions = partitions + index = [ + slice(None, None, -1) if i in iaxes else slice(None) for i in range(d.ndim) + ] - # Flip the auxiliary mask - if d._auxiliary_mask: - for mask in d._auxiliary_mask: - mask.flip(iaxes, inplace=True) - # --- End: if + dx = d.get_dask(copy=False) + dx = dx[tuple(index)] + d._set_dask(dx, reset_mask_hardness=False) return d @@ -12744,7 +10936,8 @@ def 
HDF_chunks(self, *chunks): chunks = chunks[0] if chunks is None: - # Clear all chunking + # Clear all chunking. Never change the value of the + # _HDF_chunks attribute in-place. self._HDF_chunks = None return org_HDF_chunks @@ -12752,9 +10945,10 @@ def HDF_chunks(self, *chunks): for axis, size in chunks.items(): _HDF_chunks[axes[axis]] = size - if _HDF_chunks.values() == [None] * self._ndim: + if list(_HDF_chunks.values()) == [None] * self.ndim: _HDF_chunks = None + # Never change the value of the _HDF_chunks attribute in-place self._HDF_chunks = _HDF_chunks return org_HDF_chunks @@ -12873,7 +11067,7 @@ def rint(self, inplace=False, i=False): [-2. -2. -1. -1. 0. 1. 1. 2. 2.] """ - return self.func(numpy_rint, out=True, inplace=inplace) + return self.func(np.rint, out=True, inplace=inplace) def root_mean_square( self, @@ -12999,7 +11193,7 @@ def round(self, decimals=0, inplace=False, i=False): """ return self.func( - numpy_round, out=True, inplace=inplace, decimals=decimals + np.round, out=True, inplace=inplace, decimals=decimals ) def stats( @@ -13254,12 +11448,6 @@ def swapaxes(self, axis0, axis1, inplace=False, i=False): iaxes[axis1], iaxes[axis0] = axis0, axis1 d.transpose(iaxes, inplace=True) - # Swap axes in the auxiliary mask - if d._auxiliary_mask: - for mask in d._auxiliary_mask: - mask.swapaxes(axis0, axis1, inplace=True) - # --- End: if - return d def save_to_disk(self, itemsize=None): @@ -13296,7 +11484,7 @@ def fits_in_memory(self, itemsize): # Note that self._size*(itemsize+1) is the array size in bytes # including space for a full boolean mask # ------------------------------------------------------------ - return self._size * (itemsize + 1) <= free_memory() - cf_fm_threshold() + return self.size * (itemsize + 1) <= free_memory() - cf_fm_threshold() def fits_in_one_chunk_in_memory(self, itemsize): """Return True if the master array is small enough to be @@ -13330,85 +11518,99 @@ def fits_in_one_chunk_in_memory(self, itemsize): @_deprecated_kwarg_check("i") 
@_inplace_enabled(default=False) @_manage_log_level_via_verbosity + @daskified(daskified_log_level) def where( self, condition, x=None, y=None, inplace=False, i=False, verbose=None ): - """Assign to data elements depending on a condition. - - Data can be changed by assigning to elements that are selected by - a condition based on the data values. + """Assign array elements depending on a condition. - Different values can be assigned to where the conditions are, and - are not, met. + The elements to be changed are identified by a + condition. Different values can be assigned according to where + the condition is True (assignment from the *x* parameter) or + False (assignment from the *y* parameter). **Missing data** - Data array elements may be set to missing values by assigning them - to the `cf.masked` constant, or by assignment missing data - elements of array-valued *x* and *y* parameters. + Array elements may be set to missing values if either *x* or + *y* is the `cf.masked` constant, or by assignment from any + missing data elements in *x* or *y*. - By default the data mask is "hard", meaning that masked values can - not be changed by assigning them to another value. This behaviour - may be changed by setting the `hardmask` attribute to `False`, - thereby making the data mask "soft" and allowing masked elements - to be set to non-masked values. + If the data mask is hard (see the `hardmask` attribute) then + missing data values in the array will not be overwritten, + regardless of the content of *x* and *y*. + + If the *condition* contains missing data then the + corresponding elements in the array will not be assigned to, + regardless of the contents of *x* and *y*. + + **Broadcasting** + + The array and the *condition*, *x* and *y* parameters must all + be broadcastable to each other, such that the shape of the + result is identical to the original shape of the array. 
+ + If *condition* is a `Query` object then for the purposes of + broadcasting, the condition is considered to be that which is + produced by applying the query to the array. + + **Performance** + + If any of the shapes of the *condition*, *x*, or *y* + parameters, or the array, is unknown, then there is a + possibility that an unknown shape will need to be calculated + immediately by executing all delayed operations on that + object. .. seealso:: `cf.masked`, `hardmask`, `__setitem__` :Parameters: - condition: - The condition which determines how to assign values to the - data. + condition: array-like or `Query` + The condition which determines how to assign values to + the data. + + Assignment from the *x* and *y* parameters will be + done where elements of the condition evaluate to + `True` and `False` respectively. - In general it may be any scalar or array-like object (such - as a numpy array or `Data` instance) that is broadcastable - to the shape of the data. Assignment from the *x* and *y* - parameters will be done where elements of the condition - evaluate to `True` and `False` respectively. + If *condition* is a `Query` object then this implies a + condition defined by applying the query to the data. *Parameter example:* - ``d.where(d.data<0, x=-999)`` will set all data values that - are less than zero to -999. + ``d.where(d < 0, x=-999)`` will set all data + values that are less than zero to -999. *Parameter example:* - ``d.where(True, x=-999)`` will set all data values to - -999. This is equivalent to ``d[...] = -999``. + ``d.where(True, x=-999)`` will set all data values + to -999. This is equivalent to ``d[...] = -999``. *Parameter example:* - ``d.where(False, y=-999)`` will set all data values to - -999. This is equivalent to ``d[...] = -999``. + ``d.where(False, y=-999)`` will set all data values + to -999. This is equivalent to ``d[...] = -999``. 
*Parameter example:* - If data ``d`` has shape ``(5, 3)`` then ``d.where([True, + If ``d`` has shape ``(5, 3)`` then ``d.where([True, False, True], x=-999, y=cf.masked)`` will set data - values in columns 0 and 2 to -999, and data values in - column 1 to missing data. This works because the + values in columns 0 and 2 to -999, and data values + in column 1 to missing data. This works because the condition has shape ``(3,)`` which broadcasts to the data shape. - If *condition* is a `Query` object then this implies a - condition defined by applying the query to the data. - *Parameter example:* - ``d.where(cf.lt(0), x=-999)`` will set all data values - that are less than zero to -999. This is equivalent to - ``d.where(d<0, x=-999)``. - - x, y: *optional* - Specify the assignment values. Where the condition - evaluates to `True`, assign to the data from *x*, and - where the condition evaluates to `False`, assign to the - data from *y*. The *x* and *y* parameters are each one of: + ``d.where(cf.lt(0), x=-999)`` will set all data + values that are less than zero to -999. This is + equivalent to ``d.where(d < 0, x=-999)``. - * `None`. The appropriate data elements array are - unchanged. This the default. + x, y: array-like or `None` + Specify the assignment values. Where the condition is + True assign to the data from *x*, and where the + condition is False assign to the data from *y*. - * Any scalar or array-like object (such as a numpy array, - or `Data` instance) that is broadcastable to the shape - of the data. + If *x* is `None` (the default) then no assignment is + carried out where the condition is True. - .. + If *y* is `None` (the default) then no assignment is + carried out where the condition is False. *Parameter example:* ``d.where(condition)``, for any ``condition``, returns @@ -13419,6 +11621,12 @@ def where( sign of all negative data values, and set all other data values to missing data. 
+ *Parameter example:* + ``d.where(cf.lt(0), x=-d)`` will change the sign of + all negative data values, and leave all other data + values unchanged. This is equivalent to, but faster + than, ``d.where(cf.lt(0), x=-d, y=d)`` + {{inplace: `bool`, optional}} {{verbose: `int` or `str` or `None`, optional}} @@ -13431,356 +11639,152 @@ def where( The new data with updated values, or `None` if the operation was in-place. - **Examples:** - - """ - - def _slice_to_partition(data, indices): - """Return a numpy array for the part of the input data which - spans the given indices. - - :Parameters: - - data: `cf.Data` - - indices: `tuple` - - :Returns: - - `numpy.ndarray` - - """ - indices2 = [ - (slice(0, 1) if n == 1 else i) - for n, i in zip(data.shape[::-1], indices[::-1]) - ] - - return data[tuple(indices2)[::-1]].array - - # --- End: def - - def _is_broadcastable(data0, data1, do_not_broadcast, is_scalar): - """Check that the data1 is broadcastable to data0 and return - data1, as a python scalar if possible. + **Examples** - .. note:: The input lists are updated inplace. + >>> d = cf.Data([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> e = d.where(d < 5, d, 10 * d) + >>> print(e.array) + [ 0 1 2 3 4 50 60 70 80 90] - :Parameters: + >>> d = cf.Data([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'km') + >>> e = d.where(d < 5, cf.Data(10000 * d, 'metre')) + >>> print(e.array) + [ 0. 10. 20. 30. 40. 5. 6. 7. 8. 9.] - data0: `Data` + >>> e = d.where(d < 5, cf.masked) + >>> print(e.array) + [-- -- -- -- -- 5 6 7 8 9] - data1: `Data` + >>> d = cf.Data([[1, 2,], + ... 
[3, 4]]) + >>> e = d.where([[True, False], [True, True]], d, [[9, 8], [7, 6]]) + >>> print(e.array) + [[1 8] + [3 4]] + >>> e = d.where([[True, False], [True, True]], [[9, 8], [7, 6]]) + >>> print(e.array) + [[9 2] + [7 6]] - do_not_broadcast: `list` + The result must have the same shape as the + original data: - is_scalar: `list` + >>> e = d.where([True, False], [9, 8]) + >>> print(e.array) + [[9 2] + [9 4]] - :Returns: + >>> d = cf.Data(np.array([[0, 1, 2], + ... [0, 2, 4], + ... [0, 3, 6]])) + >>> e = d.where(d < 4, None, -1) + >>> print(e.array) + [[ 0 1 2] + [ 0 2 -1] + [ 0 3 -1]] + + >>> x, y = np.ogrid[:3, :4] + >>> print(x) + [[0] + [1] + [2]] + >>> print(y) + [[0 1 2 3]] + >>> condition = x < y + >>> print(condition) + [[False True True True] + [False False True True] + [False False False True]] + >>> d = cf.Data(x) + >>> e = d.where(condition, d, 10 + y) + ... + ValueError: where: Broadcasting the 'condition' parameter with shape (3, 4) would change the shape of the data with shape (3, 1) + + >>> d = cf.Data(np.arange(9).reshape(3, 3)) + >>> e = d.copy() + >>> e[1, 0] = cf.masked + >>> f = e.where(d > 5, None, -3.1416) + >>> print(f.array) + [[-3.1416 -3.1416 -3.1416] + [-- -3.1416 -3.1416] + [6.0 7.0 8.0]] + >>> e.soften_mask() + >>> f = e.where(d > 5, None, -3.1416) + >>> print(f.array) + [[-3.1416 -3.1416 -3.1416] + [-3.1416 -3.1416 -3.1416] + [ 6. 7. 8. ]] - `Data` or scalar - Return *data1* or, if possible, ``data1.datum(0)``. 
+ """ + d = _inplace_enabled_define_and_cleanup(self) - """ - shape0 = data0._shape - shape1 = data1._shape - size1 = data1._size - - if shape1 == shape0: - do_not_broadcast.append(True) - is_scalar.append(False) - - elif size1 == 1: - do_not_broadcast.append(False) - is_scalar.append(True) - # Replace data1 with its scalar value - data1 = data1.datum(0) - - elif data1._ndim <= data0._ndim and size1 < data0._size: - do_not_broadcast.append(False) - is_scalar.append(False) - for n, m in zip(shape1[::-1], shape0[::-1]): - if n != m and n != 1: - raise ValueError( - "where: Can't broadcast data with shape {} to " - "shape {}".format(shape1, shape0) - ) - else: - raise ValueError( - "where: Can't broadcast data with shape {} to " - "shape {}".format(shape1, shape0) - ) + units = d.Units + dx = d.get_dask(copy=False) - return data1 + # Parse condition + if getattr(condition, "isquery", False): + # Condition is a cf.Query object: Make sure that the + # condition units are OK, and convert the condition to a + # boolean dask array with the same shape as the data. + condition = condition.copy() + condition = condition.set_condition_units(units) + condition = condition.evaluate(d) - # --- End: def + condition = type(self).asdata(condition) + _where_broadcastable(d, condition, "condition") - d = _inplace_enabled_define_and_cleanup(self) + # If x or y is self then change it to None. This prevents an + # unnecessary copy; and, at compute time, an unncessary numpy + # where. 
+ if x is self: + x = None - logger.debug(" data.shape = {}".format(d.shape)) # pragma: no cover - logger.debug( - " condition = {!r}".format(condition) - ) # pragma: no cover + if y is self: + y = None if x is None and y is None: - # The data is unchanged regardless of condition - if inplace: - d = None + # The data is unchanged regardless of the condition return d - do_not_broadcast = [] - is_scalar = [] - - # # ------------------------------------------------------------ - # # Make sure that the condition is a cf.Data object - # # ------------------------------------------------------------ - # - # if not isinstance(condition, d.__class__): - # condition = type(d)(condition) - - # ------------------------------------------------------------ - # Check that the input condition is broadcastable - # ------------------------------------------------------------ - condition = Data.asdata(condition, copy=False) - condition = _is_broadcastable( - d, condition, do_not_broadcast, is_scalar - ) - - # if isinstance(condition, Query): - # condition = condition.evaluate(f).Data - # ------------------------------------------------------------ - # Parse inputs x and y so that each is one of A) None, B) a - # scalar or C) a data array with the same shape as the master - # array - # ------------------------------------------------------------ + # Parse x and y xy = [] - for value in (x, y): - if value is None or value is cf_masked: - do_not_broadcast.append(False) - is_scalar.append(True) - - else: - # Make sure that the value is a cf.Data object and has - # compatible units - if not isinstance(value, d.__class__): - value = type(d)(value) - else: - if value.Units.equivalent(d.Units): - if not value.Units.equals(d.Units): - value = value.copy() - value.Units = d.Units - elif value.Units: - raise ValueError( - "where: Can't assign values with " - "units {!r} to data with units {!r}".format( - value.Units, d.Units - ) - ) - # --- End: if - - # Check that the value is broadcastable - 
value = _is_broadcastable( - d, value, do_not_broadcast, is_scalar - ) - # --- End: if - - xy.append(value) - # --- End: for - - (x, y) = xy - (condition_is_scalar, x_is_scalar, y_is_scalar) = is_scalar - broadcast = not any(do_not_broadcast) - - logger.debug(" x = {!r}".format(x)) # pragma: no cover - logger.debug(" y = {!r}".format(y)) # pragma: no cover - logger.debug( - " condition_is_scalar = {!r}".format(condition_is_scalar) - ) # pragma: no cover - logger.debug( - " x_is_scalar = {!r}".format(x_is_scalar) - ) # pragma: no cover - logger.debug( - " y_is_scalar = {!r}".format(y_is_scalar) - ) # pragma: no cover - logger.debug( - " broadcast = {!r}".format(broadcast) - ) # pragma: no cover - - # ------------------------------------------------------------- - # Try some short cuts if the condition is a scalar - # ------------------------------------------------------------- - if condition_is_scalar and not getattr(condition, "isquery", False): - logger.debug( - " Condition is a scalar: {} {}".format( - condition, type(condition) - ) - ) - if condition: - if x is not None: - d[...] = x - - if inplace: - d = None - return d - else: - if y is not None: - d[...] = y - - if inplace: - d = None - return d - # --- End: if - - # Still here? - hardmask = d.hardmask - config = d.partition_configuration(readonly=False) # or True? 
- - for partition in d.partitions.matrix.flat: - logger.debug(" Partition:") # pragma: no cover - - partition.open(config) - array = partition.array - # -------------------------------------------------------- - # Find the master array indices for this partition - # -------------------------------------------------------- - shape = array.shape - indices = partition.indices - - # -------------------------------------------------------- - # Find the condition for this partition - # -------------------------------------------------------- - if getattr(condition, "isquery", False): - if hasattr(condition._value, "_Units"): - # Ensure query data has equal units before evaluation - orig_condition_units = condition._value._Units - p_units = partition.Units - if orig_condition_units.equivalent(p_units): - if not orig_condition_units.equals(p_units): - # Convert equivalent units to equal units - condition._value._Units = p_units - else: - raise ValueError( - "where: Can't apply a query condition with " - "units '{!s}' on data with non-equivalent " - "units '{!s}'".format( - orig_condition_units, p_units - ) - ) - c = condition.evaluate(array) - elif condition_is_scalar: - c = condition - else: - c = _slice_to_partition(condition, indices) - - c_masked = numpy_ma_isMA(c) and numpy_ma_is_masked(c) + for arg, name in zip((x, y), ("x", "y")): + if arg is None: + xy.append(arg) + continue - # -------------------------------------------------------- - # Find value to use where condition is True for this - # partition - # -------------------------------------------------------- - if x_is_scalar: - if x is None: - # Use d - T = array - T_masked = partition.masked - else: - T = x - T_masked = x is cf_masked - else: - T = _slice_to_partition(x, indices) - T_masked = numpy_ma_isMA(T) and numpy_ma_is_masked(T) + if arg is cf_masked: + # Replace masked constant with array + xy.append(scalar_masked_array(self.dtype)) + continue - # -------------------------------------------------------- 
- # Find value to use where condition is False for this - # partition - # -------------------------------------------------------- - if y_is_scalar: - if y is None: - # Use d - F = array - F_masked = partition.masked - else: - F = y - F_masked = y is cf_masked - else: - F = _slice_to_partition(y, indices) - F_masked = numpy_ma_isMA(F) and numpy_ma_is_masked(F) + arg = type(self).asdata(arg) + _where_broadcastable(d, arg, name) - # -------------------------------------------------------- - # Make sure that at least one of the arrays is the same - # shape as the partition - # -------------------------------------------------------- - if broadcast: - if x is cf_masked or y is cf_masked: - c = _broadcast(c, shape) - else: - max_sizes = max( - (numpy_size(c), numpy_size(T), numpy_size(F)) + if arg.Units: + # Make sure that units are OK. + arg = arg.copy() + try: + arg.Units = units + except ValueError: + raise ValueError( + f"where: {name!r} parameter units {arg.Units!r} " + f"are not equivalent to data units {units!r}" ) - if numpy_size(c) == max_sizes: - c = _broadcast(c, shape) - elif numpy_size(T) == max_sizes: - T = _broadcast(T, shape) - else: - F = _broadcast(F, shape) - # --- End: if - - logger.debug(" array = {}".format(array)) # pragma: no cover - logger.debug(" c = {}".format(c)) # pragma: no cover - logger.debug(" T = {}".format(T)) # pragma: no cover - logger.debug(" F = {}".format(F)) # pragma: no cover - - # -------------------------------------------------------- - # Create a numpy array which takes vales from T where c - # is True and from F where c is False - # -------------------------------------------------------- - if T_masked or F_masked: - # T and/or F have missing data - new = numpy_ma_where(c, T, F) - if c_masked: - new = numpy_ma_where(c.mask, array, new) - - if partition.masked: - if hardmask: - # The original partition has missing data and - # a hardmask, so apply the original - # partition's mask to the new array. 
- new.mask |= array.mask - elif not numpy_ma_is_masked(new): - # The original partition has missing data and - # a softmask and the new array doesn't have - # missing data, so turn the new array into an - # unmasked array. - new = new.data[...] - - elif not numpy_ma_is_masked(new): - # The original partition doesn't have missing data - # and neither does the new array, so turn the new - # array into an unmasked array. - new = new.data[...] - else: - # Neither T nor F have missing data - new = numpy_where(c, T, F) - if c_masked: - new = numpy_ma_where(c.mask, array, new) - - if partition.masked and hardmask: - # The original partition has missing data and a - # hardmask, so apply the original partition's mask - # to the new array. - new = numpy_ma_masked_where(array.mask, new, copy=False) - # --- End: if + xy.append(arg.get_dask(copy=False)) - # -------------------------------------------------------- - # Replace the partition's subarray with the new numpy - # array - # -------------------------------------------------------- - logger.debug(" new = {}".format(new)) # pragma: no cover + x, y = xy - partition.subarray = new + # Apply the where operation + dx = da.core.elemwise( + cf_where, dx, dask_compatible(condition), x, y, d.hardmask + ) + d._set_dask(dx) - partition.close() - # --- End: for + # Note: No need to run `_reset_mask_hardness` at this point + # because the mask hardness has already been correctly + # set in `cf_where`. 
return d @@ -13836,7 +11840,7 @@ def sin(self, inplace=False, i=False): if d.Units.equivalent(_units_radians): d.Units = _units_radians - d.func(numpy_sin, units=_units_1, inplace=True) + d.func(np.sin, units=_units_1, inplace=True) return d @@ -13893,8 +11897,7 @@ def sinh(self, inplace=False): if d.Units.equivalent(_units_radians): d.Units = _units_radians - d.func(numpy_sinh, units=_units_1, inplace=True) - + d.func(np.sinh, units=_units_1, inplace=True) return d @_inplace_enabled(default=False) @@ -13948,7 +11951,7 @@ def cosh(self, inplace=False): if d.Units.equivalent(_units_radians): d.Units = _units_radians - d.func(numpy_cosh, units=_units_1, inplace=True) + d.func(np.cosh, units=_units_1, inplace=True) return d @@ -14006,7 +12009,7 @@ def tanh(self, inplace=False): if d.Units.equivalent(_units_radians): d.Units = _units_radians - d.func(numpy_tanh, units=_units_1, inplace=True) + d.func(np.tanh, units=_units_1, inplace=True) return d @@ -14031,35 +12034,37 @@ def log(self, base=None, inplace=False, i=False): d = _inplace_enabled_define_and_cleanup(self) if base is None: - d.func(numpy_log, units=_units_1, inplace=True) + d.func(np.log, units=_units_1, inplace=True) elif base == 10: - d.func(numpy_log10, units=_units_1, inplace=True) + d.func(np.log10, units=_units_1, inplace=True) elif base == 2: - d.func(numpy_log2, units=_units_1, inplace=True) + d.func(np.log2, units=_units_1, inplace=True) else: - d.func(numpy_log, units=_units_1, inplace=True) - d /= numpy_log(base) + d.func(np.log, units=_units_1, inplace=True) + d /= np.log(base) return d + @daskified(daskified_log_level) @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) def squeeze(self, axes=None, inplace=False, i=False): """Remove size 1 axes from the data array. - By default all size 1 axes are removed, but particular axes may be - selected with the keyword arguments. + By default all size 1 axes are removed, but particular axes + may be selected with the keyword arguments. - .. 
seealso:: `flatten`, `insert_dimension`, `flip`, `swapaxes`, - `transpose` + .. seealso:: `flatten`, `insert_dimension`, `flip`, + `swapaxes`, `transpose` :Parameters: axes: (sequence of) int, optional - Select the axes. By default all size 1 axes are - removed. The *axes* argument may be one, or a sequence, of - integers that select the axis corresponding to the given - position in the list of axes of the data array. + Select the axes. By default all size 1 axes are + removed. The *axes* argument may be one, or a + sequence, of integers that select the axis + corresponding to the given position in the list of + axes of the data array. No axes are removed if *axes* is an empty sequence. @@ -14101,19 +12106,21 @@ def squeeze(self, axes=None, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) - ndim = d._ndim - if not ndim: + # TODODASK - check if axis parsing is done in dask + + if not d.ndim: if axes or axes == 0: raise ValueError( "Can't squeeze: Can't remove an axis from " - "scalar {}".format(d.__class__.__name__) + f"scalar {d.__class__.__name__}" ) if inplace: d = None + return d - shape = list(d._shape) + shape = d.shape if axes is None: axes = [i for i, n in enumerate(shape) if n == 1] @@ -14124,71 +12131,29 @@ def squeeze(self, axes=None, inplace=False, i=False): for i in axes: if shape[i] > 1: raise ValueError( - "Can't squeeze {}: Can't remove axis of " - "size {}".format(d.__class__.__name__, shape[i]) + f"Can't squeeze {d.__class__.__name__}: " + f"Can't remove axis of size {shape[i]}" ) # --- End: if if not axes: - if inplace: - d = None return d # Still here? Then the data array is not scalar and at least # one size 1 axis needs squeezing. 
- data_axes = d._axes[:] - # flip = d._flip[:] - flip = d._flip()[:] - - if not d._all_axes: - d._all_axes = tuple(data_axes) - - i_axis = [] - for axis in [data_axes[i] for i in axes]: - if axis in flip: - flip.remove(axis) - - i = data_axes.index(axis) - shape.pop(i) - data_axes.pop(i) - - i_axis.append((i, axis)) - - for partition in d.partitions.matrix.flat: - p_location = partition.location[:] - p_shape = partition.shape[:] - p_flip = partition.flip[:] - - for i, axis in i_axis: - p_location.pop(i) - p_shape.pop(i) - if axis in p_flip: - p_flip.remove(axis) - # --- End: for - - partition.location = p_location - partition.shape = p_shape - partition.flip = p_flip + dx = d.get_dask(copy=False) + dx = dx.squeeze(axis=tuple(axes)) + d._set_dask(dx, reset_mask_hardness=False) - d._ndim = len(shape) - d._shape = tuple(shape) + # Remove the squeezed axes names + d._axes = [axis for i, axis in enumerate(d._axes) if i not in axes] - # Remove squeezed axes from list of cyclic axes - for a in axes: - d._cyclic.discard(d._axes[a]) - - d._axes = data_axes - # d._flip = flip - d._flip(flip) - - # Remove size 1 partition dimensions - d.partitions.squeeze(inplace=True) - - # Squeeze the auxiliary mask - if d._auxiliary_mask: - for mask in d._auxiliary_mask: - mask.squeeze(axes, inplace=True) - # --- End: if + hdf = self._HDF_chunks + if hdf: + # Never change the value of the _HDF_chunks attribute in-place + self._HDF_chunks = { + axis: size for axis, size in hdf.items() if axis not in axes + } return d @@ -14246,7 +12211,7 @@ def tan(self, inplace=False, i=False): if d.Units.equivalent(_units_radians): d.Units = _units_radians - d.func(numpy_tan, units=_units_1, inplace=True) + d.func(np.tan, units=_units_1, inplace=True) return d @@ -14279,6 +12244,7 @@ def tolist(self): """ return self.array.tolist() + @daskified(daskified_log_level) @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) def transpose(self, axes=None, inplace=False, i=False): @@ -14319,49 +12285,28 
@@ def transpose(self, axes=None, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) - ndim = d._ndim - - # Parse the axes. By default, reverse the order of the axes. + ndim = d.ndim if axes is None: if ndim <= 1: return d - iaxes = tuple(range(ndim - 1, -1, -1)) else: - iaxes = d._parse_axes(axes) # , 'transpose') - - # Return unchanged if axes are in the same order as the data - if iaxes == tuple(range(ndim)): - if inplace: - d = None - return d - - if len(iaxes) != ndim: - raise ValueError( - "Can't tranpose: Axes don't match array: {}".format(iaxes) - ) - # --- End: if + iaxes = d._parse_axes(axes) - # Permute the axes + # Note: _axes attribute is still important/utilised post-Daskification + # because e.g. axes labelled as cyclic by the _cyclic attribute use it + # to determine their position (see #discussion_r694096462 on PR #247). data_axes = d._axes d._axes = [data_axes[i] for i in iaxes] - # Permute the shape - shape = d._shape - d._shape = tuple([shape[i] for i in iaxes]) - - # Permute the locations map - for partition in d.partitions.matrix.flat: - location = partition.location - shape = partition.shape - - partition.location = [location[i] for i in iaxes] - partition.shape = [shape[i] for i in iaxes] - - # Transpose the auxiliary mask - if d._auxiliary_mask: - for mask in d._auxiliary_mask: - mask.transpose(iaxes, inplace=True) + dx = d.get_dask(copy=False) + try: + dx = da.transpose(dx, axes=iaxes) + except ValueError: + raise ValueError( + f"Can't transpose: Axes don't match array: {axes}" + ) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -14396,7 +12341,7 @@ def trunc(self, inplace=False, i=False): [-1. -1. -1. -1. 0. 1. 1. 1. 1.]
""" - return self.func(numpy_trunc, out=True, inplace=inplace) + return self.func(np.trunc, out=True, inplace=inplace) @classmethod def empty( @@ -14506,9 +12451,9 @@ def full( """ array = FilledArray( shape=tuple(shape), - size=functools_reduce(operator_mul, shape, 1), + size=reduce(mul, shape, 1), ndim=len(shape), - dtype=numpy_dtype(dtype), + dtype=np.dtype(dtype), fill_value=fill_value, ) @@ -14610,7 +12555,7 @@ def func( # Steps for masked data when want to preserve invalid values: # Step 1. extract the non-masked data and the mask separately - detach_mask = preserve_invalid and numpy_ma_isMA(array) + detach_mask = preserve_invalid and np.ma.isMA(array) if detach_mask: mask = array.mask # must store mask before detach it below array = array.data # mask detached @@ -14623,11 +12568,11 @@ def func( p_datatype = array.dtype if datatype != p_datatype: - datatype = numpy_result_type(p_datatype, datatype) + datatype = np.result_type(p_datatype, datatype) if detach_mask: # Step 3: reattach original mask onto the output data - array = numpy_ma_array(array, mask=mask) + array = np.ma.array(array, mask=mask) partition.subarray = array @@ -14688,12 +12633,16 @@ def range( _preserve_partitions=_preserve_partitions, ) + @daskified(daskified_log_level) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def roll(self, axis, shift, inplace=False, i=False): """Roll array elements along a given axis. Equivalent in function to `numpy.roll`. + TODODASK - note that it works for multiple axes + :Parameters: axis: `int` @@ -14708,6 +12657,14 @@ def roll(self, axis, shift, inplace=False, i=False): *Parameter example:* Convolve the last axis: ``axis=-1``. + shift: `int`, or `tuple` of `int` + The number of places by which elements are shifted. + If a `tuple`, then *axis* must be a tuple of the same + size, and each of the given axes is shifted by the + corresponding number. If an `int` while *axis* is a + tuple of `int`, then the same value is used for all + given axes.
+ {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} @@ -14717,49 +12674,13 @@ def roll(self, axis, shift, inplace=False, i=False): `Data` or `None` """ - if not shift: - # Null roll - if inplace: - return - - return self.copy() - - iaxes = self._parse_axes(axis) - if len(iaxes) != 1: - raise ValueError( - "Must specify a unique domain axis with the 'axis' " - "parameter. {!r} specifies axes {!r}".format(axis, iaxes) - ) - - axis = iaxes[0] - - n = self._shape[axis] - - shift %= n - - if not shift: - # Null roll - if inplace: - return - - return self.copy() - - shift = n - shift + # TODODASK - consider matching the numpy/dask api: "shift, axis=" - indices0 = [slice(None)] * self._ndim - indices0[axis] = slice(None, shift) - - indices1 = indices0[:] - indices1[axis] = slice(shift, None) - - indices0 = tuple(indices0) - indices1 = tuple(indices1) - - d = type(self).concatenate((self[indices1], self[indices0]), axis=axis) + d = _inplace_enabled_define_and_cleanup(self) - if inplace: - self.__dict__ = d.__dict__ - return + dx = d.get_dask(copy=False) + dx = da.roll(dx, shift, axis=axis) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -15007,7 +12928,7 @@ def sd( i=False, _preserve_partitions=False, ): - """Collapse axes by calculating their standard deviation. + r"""Collapse axes by calculating their standard deviation. The standard deviation may be adjusted for the number of degrees of freedom and may be calculated with weighted values. @@ -15365,49 +13286,6 @@ def variance( _preserve_partitions=_preserve_partitions, ) - # ---------------------------------------------------------------- - # Deprecated attributes and methods - # ---------------------------------------------------------------- - @property - def Data(self): - """Deprecated at version 3.0.0, use attribute `data` instead.""" - _DEPRECATION_ERROR_ATTRIBUTE( - self, "Data", "Use attribute 'data' instead." 
- ) # pragma: no cover - - @property - def dtvarray(self): - """Deprecated at version 3.0.0.""" - _DEPRECATION_ERROR_ATTRIBUTE(self, "dtvarray") # pragma: no cover - - def files(self): - """Deprecated at version 3.4.0, use method `get_filenames` - instead.""" - _DEPRECATION_ERROR_METHOD( - self, - "files", - "Use method `get_filenames` instead.", - version="3.4.0", - ) # pragma: no cover - - @property - def unsafe_array(self): - """Deprecated at version 3.0.0, use `array` attribute - instead.""" - _DEPRECATION_ERROR_ATTRIBUTE( - self, "unsafe_array", "Use 'array' attribute instead." - ) # pragma: no cover - - def expand_dims(self, position=0, i=False): - """Deprecated at version 3.0.0, use method `insert_dimension` - instead.""" - _DEPRECATION_ERROR_METHOD( - self, - "expand_dims", - "Use method 'insert_dimension' instead.", - version="3.0.0", - ) # pragma: no cover - # --- End: class @@ -15501,7 +13379,7 @@ def _overlapping_partitions(partitions, indices, axes, master_flip): partition.new_part(p_indices, axis_to_position, master_flip) partition.shape = shape - new_partition_matrix = numpy_empty(partitions.shape, dtype=object) + new_partition_matrix = np.empty(partitions.shape, dtype=object) new_partition_matrix[...] = partition return new_partition_matrix @@ -15543,10 +13421,10 @@ def _overlapping_partitions(partitions, indices, axes, master_flip): new_shape = [ len(set(s)) - for s in numpy_unravel_index(flat_pm_indices, partitions.shape) + for s in np.unravel_index(flat_pm_indices, partitions.shape) ] - new_partition_matrix = numpy_empty((len(flat_pm_indices),), dtype=object) + new_partition_matrix = np.empty((len(flat_pm_indices),), dtype=object) new_partition_matrix[...] = partitions_list new_partition_matrix.resize(new_shape) @@ -15554,7 +13432,7 @@ def _overlapping_partitions(partitions, indices, axes, master_flip): # -------------------------------------------------------------------- -# +# ??? 
# -------------------------------------------------------------------- def _getattr(x, attr): if not x: @@ -15562,7 +13440,7 @@ def _getattr(x, attr): return getattr(x, attr) -_array_getattr = numpy_vectorize(_getattr) +_array_getattr = np.vectorize(_getattr) def _broadcast(a, shape): @@ -15588,55 +13466,62 @@ def _broadcast(a, shape): """ # Replace with numpy.broadcast_to v1.10 ??/ TODO - a_shape = numpy_shape(a) + a_shape = np.shape(a) if a_shape == shape: return a tile = [(m if n == 1 else 1) for n, m in zip(a_shape[::-1], shape[::-1])] tile = shape[0 : len(shape) - len(a_shape)] + tuple(tile[::-1]) - return numpy_tile(a, tile) + return np.tile(a, tile) -class AuxiliaryMask: - """TODO.""" +def _where_broadcastable(data, x, name): + """Check broadcastability for `where` assignments. - def __init__(self): - """TODO.""" - self._mask = [] + Raises an exception if the result of broadcasting *data* and *x* + together does not have the same shape as *data*. - def __getitem__(self, indices): - """TODO.""" - new = type(self)() + .. versionadded:: TODODASK - for mask in self._mask: - mask_indices = [ - (slice(None) if n == 1 else index) - for n, index in zip(mask.shape, indices) - ] - new._mask.append(mask[tuple(mask_indices)]) + .. seealso:: `where` - return new + :Parameters: - # ---------------------------------------------------------------- - # Attributes - # ---------------------------------------------------------------- - @property - def ndim(self): - """TODO.""" - return self._mask[0].ndim + data, x: `Data` + The arrays to compare. - @property - def dtype(self): - """TODO.""" - return self._mask[0].dtype + name: `str` + A name for *x* that is used in any exception error + message. 
- # ---------------------------------------------------------------- - # Methods - # ---------------------------------------------------------------- - def append(self, mask): - """TODO.""" - self._mask.append(mask) + :Returns: + `bool` + If *x* is acceptably broadcastable to *data* then `True` + is returned, otherwise a `ValueError` is raised. -# --- End: class + """ + ndim_x = x.ndim + if not ndim_x: + return True + + ndim_data = data.ndim + if ndim_x > ndim_data: + raise ValueError( + f"where: Broadcasting the {name!r} parameter with {ndim_x} " + f"dimensions would change the shape of the data with " + f"{ndim_data} dimensions" + ) + + shape_x = x.shape + shape_data = data.shape + for n, m in zip(shape_x[::-1], shape_data[::-1]): + if n != m and n != 1: + raise ValueError( + f"where: Broadcasting the {name!r} parameter with shape " + f"{shape_x} would change the shape of the data with shape " + f"{shape_data}" + ) + + return True diff --git a/cf/data/filledarray.py b/cf/data/filledarray.py index 0d5227628f..8b978b3636 100644 --- a/cf/data/filledarray.py +++ b/cf/data/filledarray.py @@ -1,6 +1,5 @@ -from numpy import empty as numpy_empty +import numpy as np from numpy import full as numpy_full -from numpy.ma import masked_all as numpy_ma_masked_all from ..constants import masked as cf_masked from ..functions import parse_indices @@ -10,15 +9,7 @@ class FilledArray(abstract.Array): """An underlying filled array.""" - def __init__( - self, - dtype=None, - ndim=None, - shape=None, - size=None, - fill_value=None, - masked_all=False, - ): + def __init__(self, dtype=None, shape=None, size=None, fill_value=None): """**Initialization** :Parameters: @@ -26,11 +17,8 @@ def __init__( dtype : numpy.dtype The numpy data type of the data array. - ndim : int - Number of dimensions in the data array. - - shape : tuple - The data array's dimension sizes. + shape : tuple + The data array's dimension sizes. size : int Number of elements in the data array. 
@@ -41,11 +29,7 @@ def __init__( """ super().__init__( - dtype=dtype, - ndim=ndim, - shape=shape, - size=size, - fill_value=fill_value, + dtype=dtype, shape=shape, size=size, fill_value=fill_value ) def __getitem__(self, indices): @@ -77,125 +61,39 @@ def __getitem__(self, indices): a, b = divmod(stop - index.start, step) if b: a += 1 + array_shape.append(a) else: array_shape.append(len(index)) # --- End: if if self.fill_value() is cf_masked: - return numpy_ma_masked_all(array_shape, dtype=self.dtype) + return np.ma.masked_all(array_shape, dtype=self.dtype) elif self.fill_value() is not None: return numpy_full( array_shape, fill_value=self.fill_value(), dtype=self.dtype ) else: - return numpy_empty(array_shape, dtype=self.dtype) + return np.empty(array_shape, dtype=self.dtype) - # ---------------------------------------------------------------- - # Attributes - # ---------------------------------------------------------------- @property def dtype(self): - """Data-type of the data elements. - - **Examples:** - - >>> a.dtype - dtype('float64') - >>> print(type(a.dtype)) - - - """ + """Data-type of the data elements.""" return self._get_component("dtype") @property def ndim(self): - """Number of array dimensions. - - **Examples:** - - >>> a.shape - (73, 96) - >>> a.ndim - 2 - >>> a.size - 7008 - - >>> a.shape - (1, 1, 1) - >>> a.ndim - 3 - >>> a.size - 1 - - >>> a.shape - () - >>> a.ndim - 0 - >>> a.size - 1 - - """ - return self._get_component("ndim") + """Number of array dimensions.""" + return len(self.shape) @property def shape(self): - """Tuple of array dimension sizes. - - **Examples:** - - >>> a.shape - (73, 96) - >>> a.ndim - 2 - >>> a.size - 7008 - - >>> a.shape - (1, 1, 1) - >>> a.ndim - 3 - >>> a.size - 1 - - >>> a.shape - () - >>> a.ndim - 0 - >>> a.size - 1 - - """ + """Tuple of array dimension sizes.""" return self._get_component("shape") @property def size(self): - """Number of elements in the array. 
- - **Examples:** - - >>> a.shape - (73, 96) - >>> a.size - 7008 - >>> a.ndim - 2 - - >>> a.shape - (1, 1, 1) - >>> a.ndim - 3 - >>> a.size - 1 - - >>> a.shape - () - >>> a.ndim - 0 - >>> a.size - 1 - - """ + """Number of elements in the array.""" return self._get_component("size") def fill_value(self): @@ -211,17 +109,12 @@ def reshape(self, newshape): """Give a new shape to the array.""" new = self.copy() new.shape = newshape - new.ndim = len(newshape) return new def resize(self, newshape): """Change the shape and size of the array in-place.""" self.shape = newshape - self.ndim = len(newshape) def view(self): """Return a view of the entire array.""" return self[...] - - -# --- End: class diff --git a/cf/data/gatheredarray.py b/cf/data/gatheredarray.py index 9b63cc690d..47fc935eba 100644 --- a/cf/data/gatheredarray.py +++ b/cf/data/gatheredarray.py @@ -1,7 +1,9 @@ import cfdm +from .mixin import ArrayMixin -class GatheredArray(cfdm.GatheredArray): + +class GatheredArray(ArrayMixin, cfdm.GatheredArray): """An underlying gathered array. Compression by gathering combines axes of a multidimensional array @@ -11,9 +13,18 @@ class GatheredArray(cfdm.GatheredArray): The information needed to uncompress the data is stored in a "list variable" that gives the indices of the required points. + See CF section 8.2. "Lossless Compression by Gathering". + .. versionadded:: 3.0.0 """ + def __repr__(self): + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + .. versionadded:: 3.0.0 -# --- End: class + """ + return super().__repr__().replace("<", " x[indices] - - Returns a numpy array. 
- - """ - # The compressed array - compressed_array = self.array - - # Initialize the full, uncompressed output array with missing - # data everywhere - uarray = numpy.ma.masked_all(self.shape, dtype=compressed_array.dtype) - - compression = self.compression - compressed_dimension = compression["compressed_dimension"] - compressed_axes = compression["compressed_axes"] - compressed_part = compression["compressed_part"] - list_array = compression["indices"] - - # Initialise the uncomprssed array - n_compressed_axes = len(compressed_axes) - - uncompressed_shape = self.shape - partial_uncompressed_shapes = [ - reduce( - mul, [uncompressed_shape[i] for i in compressed_axes[j:]], 1 - ) - for j in range(1, n_compressed_axes) - ] - - sample_indices = list(compressed_part) - u_indices = [slice(None)] * self.ndim - - full = [slice(None)] * compressed_array.ndim - - zeros = [0] * n_compressed_axes - for j, b in enumerate(list_array): - # print('b=', b, end=", ") - sample_indices[compressed_dimension] = slice(j, j + 1) - - # Note that it is important for indices a and b to be - # integers (rather than the slices a:a+1 and b:b+1) so - # that these dimensions are dropped from uarray[u_indices] - u_indices[compressed_axes[0] : compressed_axes[-1] + 1] = zeros - for i, z in zip(compressed_axes[:-1], partial_uncompressed_shapes): - if b >= z: - (a, b) = divmod(b, z) - u_indices[i] = a - # --- End: for - u_indices[compressed_axes[-1]] = b - - compressed = compressed_array[tuple(sample_indices)].array - sample_indices2 = full[:] - sample_indices2[compressed_dimension] = 0 - compressed = compressed[tuple(sample_indices2)] - - uarray[tuple(u_indices)] = compressed - # --- End: for - - if indices is Ellipsis: - return uarray - else: - indices = parse_indices(self.shape, indices) - return get_subspace(uarray, indices) - - -# --- End: class diff --git a/cf/data/mixin/__init__.py b/cf/data/mixin/__init__.py new file mode 100644 index 0000000000..537df01a21 --- /dev/null +++ 
b/cf/data/mixin/__init__.py @@ -0,0 +1,2 @@ +from .deprecations import DataClassDeprecationsMixin +from .arraymixin import ArrayMixin diff --git a/cf/data/mixin/arraymixin.py b/cf/data/mixin/arraymixin.py new file mode 100644 index 0000000000..df3dd686b3 --- /dev/null +++ b/cf/data/mixin/arraymixin.py @@ -0,0 +1,9 @@ +class ArrayMixin: + """Mixin class for a container of an array. + + .. versionadded:: TODODASK + + """ + + def __array_function__(self, func, types, args, kwargs): + return NotImplemented diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py new file mode 100644 index 0000000000..205e016333 --- /dev/null +++ b/cf/data/mixin/deprecations.py @@ -0,0 +1,171 @@ +from ...functions import ( + _DEPRECATION_ERROR_ATTRIBUTE, + _DEPRECATION_ERROR_METHOD, +) + + +class DataClassDeprecationsMixin: + """Deprecated attributes and methods for the Data class.""" + + @property + def Data(self): + """Deprecated at version 3.0.0, use attribute `data` instead.""" + _DEPRECATION_ERROR_ATTRIBUTE( + self, "Data", "Use attribute 'data' instead." + ) # pragma: no cover + + @property + def dtvarray(self): + """Deprecated at version 3.0.0.""" + _DEPRECATION_ERROR_ATTRIBUTE(self, "dtvarray") # pragma: no cover + + def files(self): + """Deprecated at version 3.4.0, use method `get_filenames` instead.""" + _DEPRECATION_ERROR_METHOD( + self, + "files", + "Use method `get_filenames` instead.", + version="3.4.0", + ) # pragma: no cover + + @property + def unsafe_array(self): + """Deprecated at version 3.0.0, use `array` attribute + instead.""" + _DEPRECATION_ERROR_ATTRIBUTE( + self, "unsafe_array", "Use 'array' attribute instead."
+ ) # pragma: no cover + + def expand_dims(self, position=0, i=False): + """Deprecated at version 3.0.0, use method `insert_dimension` + instead.""" + _DEPRECATION_ERROR_METHOD( + self, + "expand_dims", + "Use method 'insert_dimension' instead.", + version="3.0.0", + ) # pragma: no cover + + @property + def ispartitioned(self): + """True if the data array is partitioned. + + **Examples:** + + >>> d._pmsize + 1 + >>> d.ispartitioned + False + + >>> d._pmsize + 2 + >>> d.ispartitioned + False + + """ + _DEPRECATION_ERROR_ATTRIBUTE(self, "ispartitioned", "TODODASK") + + def chunk(self, chunksize=None, total=None, omit_axes=None, pmshape=None): + """Partition the data array. + + :Parameters: + + chunksize: `int`, optional + The + + total: sequence of `int`, optional + + omit_axes: sequence of `int`, optional + + pmshape: sequence of `int`, optional + + :Returns: + + `None` + + **Examples:** + + >>> d.chunk() + >>> d.chunk(100000) + >>> d.chunk(100000, ) + >>> d.chunk(100000, total=[2]) + >>> d.chunk(100000, omit_axes=[3, 4]) + + """ + _DEPRECATION_ERROR_METHOD(self, "chunk", "TODODASK. Use 'rechunk' instead") + + @property + def ismasked(self): + """True if the data array has any masked values. + + TODODASK + + **Examples:** + + >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) + >>> print(d.ismasked) + False + >>> d[0, ...] = cf.masked + >>> d.ismasked + True + + """ + _DEPRECATION_ERROR_ATTRIBUTE(self, "ismasked", "TODODASK use is_masked instead") + + @property + def varray(self): + """A numpy array view the data array. + + Note that making changes to elements of the returned view changes + the underlying data. + + .. seealso:: `array`, `datetime_array` + + **Examples:** + + >>> a = d.varray + >>> type(a) + + >>> a + array([0, 1, 2, 3, 4]) + >>> a[0] = 999 + >>> d.varray + array([999, 1, 2, 3, 4]) + + """ + _DEPRECATION_ERROR_ATTRIBUTE(self, "varray", "TODODASK") + + def add_partitions(self, extra_boundaries, pdim): + """Add partition boundaries. + + :Parameters: + + extra_boundaries: `list` of `int` + The boundaries of the new partitions.
+ + pdim: `str` + The name of the axis to have the new partitions. + + :Returns: + + `None` + + **Examples:** + + >>> d.add_partitions( ) + + """ + _DEPRECATION_ERROR_METHOD(self, "add_partitions", "TODODASK Consider using rechunk instead") + + def partition_boundaries(self): + """Return the partition boundaries for each partition matrix + dimension. + + :Returns: + + `dict` + + **Examples:** + + """ + _DEPRECATION_ERROR_METHOD(self, "partition_boundaries", "TODODASK - consider using 'chunks' instead") diff --git a/cf/data/netcdfarray.py b/cf/data/netcdfarray.py index b57db83030..e000f6bb54 100644 --- a/cf/data/netcdfarray.py +++ b/cf/data/netcdfarray.py @@ -1,156 +1,7 @@ import cfdm from . import abstract -from .functions import _close_netcdf_file, _open_netcdf_file class NetCDFArray(cfdm.NetCDFArray, abstract.FileArray): - """A sub-array stored in a netCDF file.""" - - def __init__( - self, - filename=None, - ncvar=None, - varid=None, - group=None, - dtype=None, - ndim=None, - shape=None, - size=None, - mask=True, - ): - """**Initialization** - - :Parameters: - - filename: `str` - The name of the netCDF file containing the array. - - ncvar: `str`, optional - The name of the netCDF variable containing the array. Required - unless *varid* is set. - - varid: `int`, optional - The UNIDATA netCDF interface ID of the variable containing the - array. Required if *ncvar* is not set, ignored if *ncvar* is - set. - - group: `None` or sequence of `str`, optional - Specify the netCDF4 group to which the netCDF variable - belongs. By default, or if *group* is `None` or an empty - sequence, it assumed to be in the root group. The last - element in the sequence is the name of the group in which - the variable lies, with other elements naming any parent - groups (excluding the root group).
- - :Parameter example: - To specify that a variable is in the root group: - ``group=()`` or ``group=None`` - - :Parameter example: - To specify that a variable is in the group '/forecasts': - ``group=['forecasts']`` - - :Parameter example: - To specify that a variable is in the group - '/forecasts/model2': ``group=['forecasts', 'model2']`` - - .. versionadded:: 3.6.0 - - dtype: `numpy.dtype` - The data type of the array in the netCDF file. May be - `None` if the numpy data-type is not known (which can be - the case for netCDF string types, for example). - - shape: `tuple` - The array dimension sizes in the netCDF file. - - size: `int` - Number of elements in the array in the netCDF file. - - ndim: `int` - The number of array dimensions in the netCDF file. - - mask: `bool`, optional - If False then do not mask by convention when reading data - from disk. By default data is masked by convention. - - A netCDF array is masked depending on the values of any of - the netCDF variable attributes ``valid_min``, - ``valid_max``, ``valid_range``, ``_FillValue`` and - ``missing_value``. - - .. versionadded:: 3.4.0 - - **Examples:** - - >>> import netCDF4 - >>> nc = netCDF4.Dataset('file.nc', 'r') - >>> v = nc.variable['tas'] - >>> a = NetCDFFileArray(filename='file.nc', ncvar='tas', - ... group=['forecast'], dtype=v.dtype, - ... ndim=v.ndim, shape=v.shape, size=v.size) - - """ - super().__init__( - filename=filename, - ncvar=ncvar, - varid=varid, - group=group, - dtype=dtype, - ndim=ndim, - shape=shape, - size=size, - mask=mask, - ) - - # By default, keep the netCDF file open after data array - # access - self._set_component("close", False, copy=False) - - @property - def file_pointer(self): - """The file pointer starting at the position of the netCDF - variable.""" - offset = getattr(self, "ncvar", None) - if offset is None: - offset = self.varid - - return (self.get_filename(), offset) - - def close(self): - """Close the file containing the data array. 
- - If the file is not open then no action is taken. - - :Returns: - - `None` - - **Examples:** - - >>> f.close() - - """ - _close_netcdf_file(self.get_filename()) - - def open(self): - """Return a `netCDF4.Dataset` object for the file containing the - data array. - - :Returns: - - `netCDF4.Dataset` - - **Examples:** - - >>> f.open() - - - """ - return _open_netcdf_file(self.get_filename(), "r") - - -# --- End: class - -# abstract.Array.register(NetCDFArray) + """An array stored in a netCDF file.""" diff --git a/cf/data/partitionmatrix.py b/cf/data/partitionmatrix.py deleted file mode 100644 index e784eb156a..0000000000 --- a/cf/data/partitionmatrix.py +++ /dev/null @@ -1,822 +0,0 @@ -from copy import deepcopy - -import numpy -from numpy import empty as numpy_empty -from numpy import expand_dims as numpy_expand_dims -from numpy import ndenumerate as numpy_ndenumerate -from numpy import squeeze as numpy_squeeze - -from ..decorators import _inplace_enabled, _inplace_enabled_define_and_cleanup -from ..functions import _DEPRECATION_ERROR_METHOD -from .partition import Partition - -_empty_matrix = numpy_empty((), dtype=object) - - -class PartitionMatrix: - """A hyperrectangular partition matrix of a master data array. - - Each of elements (called partitions) span all or part of exactly one - sub-array of the master data array. - - Normal numpy basic and advanced indexing is supported, but size 1 - dimensions are always removed from the output array, i.e. a partition - rather than a partition matrix is returned if the output array has - size 1. - - - **Attributes** - - ========== =========================================================== - Attribute Description - ========== =========================================================== - `!axes` - `!matrix` - `!ndim` The number of partition dimensions in the partition matrix. - `!shape` List of the partition matrix's dimension sizes. - `!size` The number of partitions in the partition matrix. 
- ========== =========================================================== - - """ - - def __init__(self, matrix, axes): - """**Initialization** - - :Parameters: - - matrix: `numpy.ndarray` - An array of Partition objects. - - axes: `list` - The identities of the partition axes of the partition - array. If the partition matrix is a scalar array then it - is an empty list. DO NOT UPDATE INPLACE. - - **Examples:** - - >>> pm = PartitionMatrix( - ... numpy.array(Partition( - ... location = [(0, 1), (2, 4)], - ... shape = [1, 2], - ... _dimensions = ['dim2', 'dim0'], - ... Units = cf.Units('m'), - ... part = [], - ... data = numpy.array([[5, 6], [7, 8]]) - ... ), dtype=object), - ... axes=[] - ... ) - - """ - self.matrix = matrix - self.axes = axes - - def __deepcopy__(self, memo): - """Used if copy.deepcopy is called on the variable.""" - return self.copy() - - def __getitem__(self, indices): - """x.__getitem__(indices) <==> x[indices] - - Normal numpy basic and advanced indexing is supported, but size 1 - dimensions are always removed from the output array, i.e. a - partition rather than a partition matrix is returned if the output - array has size 1. - - Returns either a partition or a partition matrix. - - **Examples:** - - >>> pm.shape - (5, 3) - >>> pm[0, 1] - - >>> pm[:, 1] - - >>> pm[:, 1].shape - (5,) - >>> pm[1:4, slice(2, 0, -1)].shape - (3, 2) - - >>> pm.shape - () - >>> pm[()] - - >>> pm[...] - - - """ - out = self.matrix[indices] - - if isinstance(out, Partition): - return out - - if out.size == 1: - return self.matrix.item() - - axes = [axis for axis, n in zip(self.axes, out.shape) if n != 1] - - return type(self)(numpy_squeeze(out), axes) - - def __repr__(self): - """x.__repr__() <==> repr(x)""" - return "" % (self.__class__.__name__, self.shape) - - def __setitem__(self, indices, value): - """x.__setitem__(indices, y) <==> x[indices]=y. - - Indices must be an integer, a slice object or a tuple. 
If a slice - object is given then the value being assigned must be an - iterable. If a tuple of integers (or slices equivalent to an - integer) is given then there must be one index per partition - matrix dimension. - - **Examples:** - - >>> pm.shape - (3,) - >>> pm[2] = p1 - >>> pm[:] = [p1, p2, p3] - - >>> pm.shape - (2, 3) - >>> pm[0, 2] = p1 - - - >>> pm.shape - () - >>> pm[()] = p1 - >>> pm[...] = p1 - - """ - self.matrix[indices] = value - - def __str__(self): - """x.__str__() <==> str(x)""" - return str(self.matrix) - out = [] - for partition in self.matrix.flat: - out.append(str(partition)) - - return "\n".join(out) - - def change_axis_names(self, axis_map): - """Change the axis names. - - The axis names are arbitrary, so mapping them to another arbitrary - collection does not change the data array values, units, nor axis - order. - - :Parameters: - - axis_map: `dict` - - :Returns: - - `None` - - """ - # Partition dimensions - axes = self.axes - self.axes = [axis_map[axis] for axis in axes] - - # Partitions. Note that a partition may have dimensions which - # are not in self.axes and that these must also be in - # axis_name_map. - for partition in self.matrix.flat: - partition.change_axis_names(axis_map) - - # ---------------------------------------------------------------- - # Attributes - # ---------------------------------------------------------------- - @property - def flat(self): - """A flat iterator over the partitions in the partition matrix. - - **Examples:** - - >>> pm.shape - [2, 2] - >>> for partition in pm.flat: - ... print(repr(partition.Units)) - ... - - - - - - >>> pm.flat - - - >>> flat = pm.flat - >>> next(flat) - - >>> next(flat) - - - """ - return self.matrix.flat - - @property - def ndim(self): - """The number of partition dimensions in the partition matrix. - - Not to be confused with the number of dimensions of the master - data array. 
- - **Examples:** - - >>> pm.shape - (8, 4) - >>> pm.ndim - 2 - - >>> pm.shape - () - >>> pm.ndim - 0 - - """ - return self.matrix.ndim - - @property - def shape(self): - """List of the partition matrix's dimension sizes. - - Not to be confused with the sizes of the master data array's - dimensions. - - **Examples:** - - >>> pm.ndim - 2 - >>> pm.size - 32 - >>> pm.shape - (8, 4) - - >>> pm.ndim - 0 - >>> pm.shape - () - - """ - return self.matrix.shape - - @property - def size(self): - """The number of partitions in the partition matrix. - - Not to be confused with the number of elements in the master data - array. - - **Examples:** - - >>> pm.shape - (8, 4) - >>> pm.size - 32 - - >>> pm.shape - () - >>> pm.size - 1 - - """ - return self.matrix.size - - def add_partitions(self, adimensions, master_flip, extra_boundaries, axis): - """Add partition boundaries. - - :Parameters: - - adimensions: `list` - The ordered axis names of the master array. - - master_flip: `list` - - extra_boundaries: `list` of `int` - The boundaries of the new partitions. - - axis: `str` - The name of the axis to have the new partitions. - - """ - - def _update_p( - matrix, - location, - master_index, - part, - master_axis_to_position, - master_flip, - ): - """Create a new partition matrix via an update. 
- - :Parameters: - - matrix: numpy array of `Partition` objects - - location: `list` - - master_index: `int` - - part: `list` - - master_axis_to_position: `dict` - - master_flip: `list` - - :Returns: - - numpy array of `Partition` objects - - """ - for partition in matrix.flat: - partition.location = partition.location[:] - partition.shape = partition.shape[:] - - partition.location[master_index] = location - partition.shape[master_index] = shape - - partition.new_part(part, master_axis_to_position, master_flip) - # --- End: for - - return matrix - - # If no extra boundaries have been provided, just return - # without doing anything - if not extra_boundaries: - return - - master_index = adimensions.index(axis) - index = self.axes.index(axis) - - # Find the position of the extra-boundaries dimension in the - # list of master array dimensions - extra_boundaries = extra_boundaries[:] - - # Create the master_axis_to_position dictionary required by - # Partition.new_part - master_axis_to_position = {} - for i, data_axis in enumerate(adimensions): - master_axis_to_position[data_axis] = i - - matrix = self.matrix - shape = matrix.shape - - # Initialize the new partition matrix - new_shape = list(shape) - new_shape[index] += len(extra_boundaries) - new_matrix = numpy_empty(new_shape, dtype=object) - - part = [slice(None)] * len(adimensions) - indices = [slice(None)] * matrix.ndim - new_indices = indices[:] - new_indices[index] = slice(0, 1) # 0 - - # Find the first extra boundary - x = extra_boundaries.pop(0) - - for i in range(shape[index]): - indices[index] = slice(i, i + 1) - sub_matrix = matrix[tuple(indices)] - # (r0, r1) = next(sub_matrix.flat).location[master_index] - (r0, r1) = sub_matrix.item( - 0, - ).location[master_index] - - # Could do better, perhaps, by assigning in blocks - if not r0 < x < r1: - # This new boundary is *not* within the span of this - # sub-matrix. 
Therefore, just copy the sub-matrix - # straight into the new matrix - new_matrix[tuple(new_indices)] = sub_matrix - # new_indices[index] += 1 - new_indices[index] = slice( - new_indices[index].start + 1, new_indices[index].stop + 1 - ) - continue - - # -------------------------------------------------------- - # Still here? Then this new boundary *is* within the span - # of this sub-matrix. - # -------------------------------------------------------- - - # Find the new extent of the original partition(s) - location = (r0, x) - shape = x - r0 - part[master_index] = slice(0, shape) - - # Create new partition(s) in place of the original ones(s) - # and set the location, shape and part attributes - new_matrix[tuple(new_indices)] = _update_p( - deepcopy(sub_matrix), - location, - master_index, - part, - master_axis_to_position, - master_flip, - ) - # new_indices[index] += 1 - new_indices[index] = slice( - new_indices[index].start + 1, new_indices[index].stop + 1 - ) - - while x < r1: - # Find the extent of the new partition(s) - if not extra_boundaries: - # There are no more new boundaries, so the new - # partition(s) run to the end of the original - # partition(s) in which they lie. - location1 = r1 - else: - # There are more new boundaries, so this - # new partition runs either to the next - # new boundary or to the end of the - # original partition, which comes first. 
- location1 = min(extra_boundaries[0], r1) - - location = (x, location1) - shape = location1 - x - offset = x - r0 - part[master_index] = slice(offset, offset + shape) - - # Create the new partition(s) and set the - # location, shape and part attributes - new_matrix[tuple(new_indices)] = _update_p( - deepcopy(sub_matrix), - location, - master_index, - part, - master_axis_to_position, - master_flip, - ) - - new_indices[index] = slice( - new_indices[index].start + 1, new_indices[index].stop + 1 - ) - # new_indices[index] += 1 - - if not extra_boundaries: - # There are no more extra boundaries, so we can - # return now. - # new_indices[index] = slice(new_indices[index], - # None) - new_indices[index] = slice(new_indices[index].start, None) - indices[index] = slice(i + 1, None) - - new_matrix[tuple(new_indices)] = matrix[tuple(indices)] - self.matrix = new_matrix - - return - - # Still here? Then move on to the next new boundary - x = extra_boundaries.pop(0) - # --- End: while - # --- End: for - - self.matrix = new_matrix - - def copy(self): - """Return a deep copy. - - ``pm.copy()`` is equivalent to ``copy.deepcopy(pm)``. - - :Returns: - - The deep copy. - - **Examples:** - - >>> pm.copy() - - """ - # ------------------------------------------------------------ - # NOTE: 15 May 2013. It is necesary to treat - # self.matrix.ndim==0 as a special case since there is a - # bug (feature?) in numpy <= v1.7 (at least): - # http://numpy-discussion.10968.n7.nabble.com/bug-in-deepcopy-of-rank-zero-arrays-td33705.html - # ------------------------------------------------------------ - matrix = self.matrix - - if not matrix.ndim: - new_matrix = _empty_matrix.copy() # numpy_empty((), dtype=object) - new_matrix[()] = matrix.item().copy() - return type(self)(new_matrix, []) - else: - new_matrix = numpy.empty(matrix.size, dtype=object) - new_matrix[...] 
= [partition.copy() for partition in matrix.flat] - new_matrix.resize(matrix.shape) - return type(self)(new_matrix, self.axes) - - # 0 - @_inplace_enabled(default=False) - def insert_dimension(self, axis, inplace=False): - """Insert a new size 1 axis in place. - - The new axis is always inserted at position 0, i.e. it becomes the - new slowest varying axis. - - .. seealso:: `flip`, `squeezes`, `swapaxes`, `transpose` - - :Parameters: - - axis: `str` - The internal identity of the new axis. - - :Returns: - - `PartitionMatrix` - - **Examples:** - - >>> pm.shape - (2, 3) - >>> pm.insert_dimension('dim2') - >>> pm.shape - (1, 2, 3) - - """ - p = _inplace_enabled_define_and_cleanup(self) - - p.matrix = numpy_expand_dims(p.matrix, 0) - p.axes = [axis] + p.axes - - return p - - def ndenumerate(self): - """Return an iterator yielding pairs of array indices and - values. - - :Returns: - - `numpy.ndenumerate` - An iterator over the array coordinates and values. - - **Examples:** - - >>> pm.shape - (2, 3) - >>> for i, partition in pm.ndenumerate(): - ... print(i, repr(partition)) - ... - (0, 0) - (0, 1) - (0, 2) - (1, 0) - (1, 1) - (1, 2) - - """ - return numpy_ndenumerate(self.matrix) - - # 0 - def partition_boundaries(self, data_axes): - """Return the partition boundaries for each dimension. 
- - :Parameters: - - data_axes: sequence - - :Returns: - - `dict` - - """ - boundaries = {} - - matrix = self.matrix - indices = [0] * self.ndim - - for i, axis in enumerate(self.axes): - indices[i] = slice(None) - j = data_axes.index(axis) - - sub_matrix = matrix[tuple(indices)] - - b = [partition.location[j][0] for partition in sub_matrix.flat] - - # Python3: can't access variables from within previous - # list comprehension - last_partition = sub_matrix.flat[-1] - - b.append(last_partition.location[j][1]) - boundaries[axis] = b - - indices[i] = 0 - # --- End: for - - return boundaries - - # 0 - @_inplace_enabled(default=False) - def swapaxes(self, axis0, axis1, inplace=False): - """Swap the positions of two axes. - - Note that this does not change the master data array. - - .. seealso:: `insert_dimension`, `flip`, `squeeze`, `transpose` - - :Parameters: - - axis0, axis1: `int`, `int` - Select the axes to swap. Each axis is identified by its - original integer position. - - :Returns: - - `PartitionMatrix` - - **Examples:** - - >>> pm.shape - (2, 3, 4, 5) - >>> pm.swapaxes(1, 2) - >>> pm.shape - (2, 4, 3, 5) - >>> pm.swapaxes(1, -1) - >>> pm.shape - (2, 5, 3, 4) - - """ - p = _inplace_enabled_define_and_cleanup(self) - - if axis0 != axis1: - iaxes = list(range(p.matrix.ndim)) - iaxes[axis1], iaxes[axis0] = iaxes[axis0], iaxes[axis1] - p.transpose(iaxes, inplace=True) - - return p - - # 0 - def set_location_map(self, data_axes, ns=None): - """Set the `!location` attribute of each partition of the - partition matrix in place. - - :Parameters: - - data_axes: sequence - The axes of the master data array. 
- - ns: sequence of `int`, optional - - :Returns: - - `None` - - **Examples:** - - >>> pm.set_location_map(['dim1', 'dim0']) - >>> pm.set_location_map([]) - - """ - matrix = self.matrix - - shape = matrix.shape - axes = self.axes - - slice_None = slice(None) - - indices = [slice_None] * matrix.ndim - - # Never update location in-place - for partition in matrix.flat: - partition.location = partition.location[:] - - if ns is None: - ns = range(len(data_axes)) - - for axis, n in zip(data_axes, ns): - - if axis in axes: - # ---------------------------------------------------- - # This data array axis is also a partition matrix axis - # ---------------------------------------------------- - m = axes.index(axis) - start = 0 - for i in range(shape[m]): - indices[m] = slice(i, i + 1) # i - flat = matrix[tuple(indices)].flat - - partition = next(flat) - stop = start + partition.shape[n] - location = (start, stop) - - partition.location[n] = location - - for partition in flat: - partition.location[n] = location - - start = stop - - # --- End: for - indices[m] = slice_None - else: - # ---------------------------------------------------- - # This data array axis is not a partition matrix axis - # ---------------------------------------------------- - flat = matrix.flat - partition = next(flat) - location = (0, partition.shape[n]) - - partition.location[n] = location - - for partition in flat: - partition.location[n] = location - # --- End: for - - # 0 - @_inplace_enabled(default=False) - def squeeze(self, inplace=False): - """Remove all size 1 axes. - - Note that this does not change the master data array. - - .. 
seealso:: `insert_dimension`, `flip`, `swapaxes`, `transpose` - - :Returns: - - `PartitionMatrix` - - **Examples:** - - >>> pm.shape - (1, 2, 1, 2) - >>> pm.squeeze() - >>> pm.shape - (2, 2) - - >>> pm.shape - (1,) - >>> pm.squeeze() - >>> pm.shape - () - >>> pm.squeeze() - >>> pm.shape - () - - """ - p = _inplace_enabled_define_and_cleanup(self) - - matrix = p.matrix - shape = matrix.shape - - if 1 in shape: - p.matrix = matrix.squeeze() - - axes = p.axes - p.axes = [axis for axis, size in zip(axes, shape) if size > 1] - - return p - - @_inplace_enabled(default=False) - def transpose(self, axes, inplace=False): - """Permute the partition dimensions of the partition matrix in - place. - - Note that this does not change the master data array. - - .. seealso:: `insert_dimension`, `flip`, `squeeze`, `swapaxes` - - :Parameters: - - axes: sequence of `int` - Permute the axes according to the values given. - - :Returns: - - `PartitionMatrix` - - **Examples:** - - >>> pm.ndim - 3 - >>> pm.transpose((2, 0, 1)) - - """ - p = _inplace_enabled_define_and_cleanup(self) - - matrix = p.matrix - if list(axes) != list(range(matrix.ndim)): - p.matrix = matrix.transpose(axes) - p_axes = p.axes - p.axes = [p_axes[i] for i in axes] - - return p - - # ---------------------------------------------------------------- - # Deprecated attributes and methods - # ---------------------------------------------------------------- - def expand_dims(self, axis, inplace=False): - """Insert a new size 1 axis in place. - - Deprecated at version 3.0.0. Use method 'insert_dimension' - instead. - - """ - _DEPRECATION_ERROR_METHOD( - self, "expand_dims", "Use method 'insert_dimension' instead." 
- ) # pragma: no cover - - -# --- End: class diff --git a/cf/data/raggedcontiguousarray.py b/cf/data/raggedcontiguousarray.py index 607be5d234..4c62f200cb 100644 --- a/cf/data/raggedcontiguousarray.py +++ b/cf/data/raggedcontiguousarray.py @@ -1,7 +1,9 @@ import cfdm +from .mixin import ArrayMixin -class RaggedContiguousArray(cfdm.RaggedContiguousArray): + +class RaggedContiguousArray(ArrayMixin, cfdm.RaggedContiguousArray): """An underlying contiguous ragged array. A collection of features stored using a contiguous ragged array @@ -12,9 +14,21 @@ class RaggedContiguousArray(cfdm.RaggedContiguousArray): The information needed to uncompress the data is stored in a "count variable" that gives the size of each block. + It is assumed that the compressed dimension is the left-most + dimension in the compressed array. + + See CF section 9 "Discrete Sampling Geometries". + .. versionadded:: 3.0.0 """ + def __repr__(self): + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + .. versionadded:: 3.0.0 -# --- End: class + """ + return super().__repr__().replace("<", " x[indices] - - Returns a numpy array. 
- - """ - # The compressed array - array = self.array - - # Initialize the full, uncompressed output array with missing - # data everywhere - uarray = numpy_ma_masked_all(self.shape, dtype=array.dtype) - - p_indices = [slice(None)] * uarray.ndim - - compression = self.compression - instance_axis = compression["instance_axis"] - instance_index = compression["instance_index"] - element_axis = compression["c_element_axis"] - sample_indices = compression["c_element_indices"] - - p_indices[instance_axis] = instance_index - p_indices[element_axis] = slice( - 0, sample_indices.stop - sample_indices.start - ) - - uarray[tuple(p_indices)] = array[sample_indices] - - logger.debug("instance_axis = {}".format(instance_axis)) - logger.debug("instance_index = {}".format(instance_index)) - logger.debug("element_axis = {}".format(element_axis)) - logger.debug("sample_indices = {}".format(sample_indices)) - logger.debug("p_indices = {}".format(p_indices)) - logger.debug("uarray.shape = {}".format(uarray.shape)) - logger.debug("self.array.shape = {}".format(array.shape)) - - if indices is Ellipsis: - return uarray - else: - logger.debug("indices = {}".format(indices)) - - indices = parse_indices(self.shape, indices) - logger.debug( - "parse_indices(self.shape, indices) = {}".format(indices) - ) - - return get_subspace(uarray, indices) - - -# --- End: class diff --git a/cf/data/raggedindexedarray.py b/cf/data/raggedindexedarray.py index b1435f9ff4..fe9bc9f39b 100644 --- a/cf/data/raggedindexedarray.py +++ b/cf/data/raggedindexedarray.py @@ -1,7 +1,9 @@ import cfdm +from .mixin import ArrayMixin -class RaggedIndexedArray(cfdm.RaggedIndexedArray): + +class RaggedIndexedArray(ArrayMixin, cfdm.RaggedIndexedArray): """An underlying indexed ragged array. A collection of features stored using an indexed ragged array @@ -13,9 +15,21 @@ class RaggedIndexedArray(cfdm.RaggedIndexedArray): "index variable" that specifies the feature that each element of the sample dimension belongs to. 
+ It is assumed that the compressed dimension is the left-most + dimension in the compressed array. + + See CF section 9 "Discrete Sampling Geometries". + .. versionadded:: 3.0.0 """ + def __repr__(self): + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + .. versionadded:: 3.0.0 -# --- End: class + """ + return super().__repr__().replace("<", " repr(x) + + .. versionadded:: 3.0.0 -# --- End: class + """ + return super().__repr__().replace("<", " x[indices] - - Returns a numpy array. - - """ - # The compressed array - array = self.array - - # Initialize the full, uncompressed output array with missing - # data everywhere - uarray = numpy.ma.masked_all(self.shape, dtype=array.dtype) - - p_indices = [slice(None)] * uarray.ndim - - compression = self.compression - - instance_axis = compression["instance_axis"] - instance_index = compression["instance_index"] - i_element_axis = compression["i_element_axis"] - i_element_index = compression["i_element_index"] - c_element_axis = compression["c_element_axis"] - c_element_indices = compression["c_element_indices"] - - p_indices[instance_axis] = instance_index - p_indices[i_element_axis] = i_element_index - p_indices[c_element_axis] = slice( - 0, c_element_indices.stop - c_element_indices.start - ) - - uarray[tuple(p_indices)] = array[c_element_indices] - - if indices is Ellipsis: - return uarray - else: - logger.debug("indices = {}".format(indices)) - - indices = parse_indices(self.shape, indices) - logger.debug( - "parse_indices(self.shape, indices) = {}".format(indices) - ) - - return get_subspace(uarray, indices) - - -# def __repr__(self): -# '''x.__repr__() <==> repr(x) -# -# ''' -# return "" % (self.__class__.__name__, str(self.array)) - - -# --- End: class diff --git a/cf/data/raggedindexedsubarray.py b/cf/data/raggedindexedsubarray.py deleted file mode 100644 index 67e9dcc646..0000000000 --- a/cf/data/raggedindexedsubarray.py +++ /dev/null @@ -1,69 +0,0 @@ -import logging - -import numpy - 
-from ..functions import get_subspace, parse_indices -from . import abstract - -logger = logging.getLogger(__name__) - - -class RaggedIndexedSubarray(abstract.CompressedSubarray): - """An underlying indexed ragged sub-array.""" - - def __getitem__(self, indices): - """x.__getitem__(indices) <==> x[indices] - - Returns a numpy array. - - """ - # The compressed array - array = self.array - - # Initialize the full, uncompressed output array with missing - # data everywhere - uarray = numpy.ma.masked_all(self.shape, dtype=array.dtype) - - p_indices = [slice(None)] * uarray.ndim - - compression = self.compression - - instance_axis = compression["instance_axis"] - instance_index = compression["instance_index"] - element_axis = compression["i_element_axis"] - sample_indices = compression["i_element_indices"] - - p_indices[instance_axis] = instance_index - p_indices[element_axis] = slice(0, len(sample_indices)) - - uarray[tuple(p_indices)] = array[sample_indices] - - logger.debug("instance_axis = {}".format(instance_axis)) - logger.debug("instance_index = {}".format(instance_index)) - logger.debug("element_axis = {}".format(element_axis)) - logger.debug("sample_indices = {}".format(sample_indices)) - logger.debug("p_indices = {}".format(p_indices)) - logger.debug("uarray.shape = {}".format(uarray.shape)) - logger.debug("self.array.shape = {}".format(array.shape)) - - if indices is Ellipsis: - return uarray - else: - logger.debug("indices = {}".format(indices)) - - indices = parse_indices(self.shape, indices) - logger.debug( - "parse_indices(self.shape, indices) = {}".format(indices) - ) - - return get_subspace(uarray, indices) - - -# def __repr__(self): -# '''x.__repr__() <==> repr(x) -# -# ''' -# return "" % (self.__class__.__name__, str(self.array)) - - -# --- End: class diff --git a/cf/data/subsampledarray.py b/cf/data/subsampledarray.py new file mode 100644 index 0000000000..91597fd4f5 --- /dev/null +++ b/cf/data/subsampledarray.py @@ -0,0 +1,80 @@ +import cfdm + +from 
.mixin import ArrayMixin + + +class SubsampledArray(ArrayMixin, cfdm.SubsampledArray): + """An underlying subsampled array. + + For some structured coordinate data (e.g. coordinates describing + remote sensing products) space may be saved by storing a subsample + of the data, called tie points. The uncompressed data can be + reconstituted by interpolation, from the subsampled values. This + process will likely result in a loss in accuracy (as opposed to + precision) in the uncompressed variables, due to rounding and + approximation errors in the interpolation calculations, but it is + assumed that these errors will be small enough to not be of + concern to users of the uncompressed dataset. The creator of the + compressed dataset can control the accuracy of the reconstituted + data through the degree of subsampling and the choice of + interpolation method. + + See CF section 8.3 "Lossy Compression by Coordinate Subsampling" + and Appendix J "Coordinate Interpolation Methods". + + >>> tie_point_indices={{package}}.TiePointIndex(data=[0, 4, 7, 8, 11]) + >>> w = {{package}}.InterpolationParameter(data=[5, 10, 5]) + >>> coords = {{package}}.SubsampledArray( + ... interpolation_name='quadratic', + ... compressed_array={{package}}.Data([15, 135, 225, 255, 345]), + ... shape=(12,), + ... tie_point_indices={0: tie_point_indices}, + ... parameters={"w": w}, + ... parameter_dimensions={"w": (0,)}, + ... ) + >>> print(coords[...]) + [ 15. 48.75 80. 108.75 135. + 173.88888889 203.88888889 225. 255. 289.44444444 + 319.44444444 345. ] + + **Cell boundaries** + + When the tie points array represents bounds tie points then the + *shape* parameter describes the uncompressed bounds shape. See CF + section 8.3.9 "Interpolation of Cell Boundaries". + + >>> bounds = {{package}}.SubsampledArray( + ... interpolation_name='quadratic', + ... compressed_array={{package}}.Data([0, 150, 240, 240, 360]), + ... shape=(12, 2), + ... tie_point_indices={0: tie_point_indices}, + ... 
parameters={"w": w}, + ... parameter_dimensions={"w": (0,)}, + ... ) + >>> print(bounds[...]) + [[0.0 33.2] + [33.2 64.8] + [64.8 94.80000000000001] + [94.80000000000001 123.2] + [123.2 150.0] + [150.0 188.88888888888889] + [188.88888888888889 218.88888888888889] + [218.88888888888889 240.0] + [240.0 273.75] + [273.75 305.0] + [305.0 333.75] + [333.75 360.0]] + + .. versionadded:: TODODASK + + """ + + def __repr__(self): + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + .. versionadded:: 3.0.0 + + """ + return super().__repr__().replace("<", ">> a = UMFileArray(file='file.pp', header_offset=3156, data_offset=3420, - ... dtype=numpy.dtype('float32'), shape=(30, 24), - ... size=720, ndim=2, disk_length=0) + >>> a = UMFileArray(file='file.pp', header_offset=3156, + ... data_offset=3420, + ... dtype=numpy.dtype('float32'), + ... shape=(1, 1, 30, 24), + ... size=720, ndim=4, disk_length=0) >>> a = UMFileArray( - ... file='packed_file.pp', header_offset=3156, data_offset=3420, + ... file='packed_file.pp', header_offset=3156, + ... data_offset=3420, ... dtype=numpy.dtype('float32'), shape=(30, 24), ... size=720, ndim=2, disk_length=423 - ... ) + ... ) """ super().__init__( filename=filename, dtype=dtype, - ndim=ndim, shape=shape, - size=size, header_offset=header_offset, data_offset=data_offset, disk_length=disk_length, @@ -89,17 +96,15 @@ def __init__( byte_ordering=byte_ordering, ) - # By default, do not close the UM file after data array access - self._close = False + # By default, close the UM file after data array access + self._set_component("close", True, copy=False) def __getitem__(self, indices): - """Implement indexing. + """Return a subspace of the array. x.__getitem__(indices) <==> x[indices] - :Returns: - - `numpy.ndarray` + Returns a subspace of the array as an independent numpy array. 
""" f = self.open() @@ -111,22 +116,13 @@ def __getitem__(self, indices): int_hdr = rec.int_hdr real_hdr = rec.real_hdr - array = rec.get_data().reshape( - int_hdr.item( - 17, - ), - int_hdr.item( - 18, - ), - ) + array = rec.get_data().reshape(self.shape) if indices is not Ellipsis: indices = parse_indices(array.shape, indices) array = get_subspace(array, indices) - LBUSER2 = int_hdr.item( - 38, - ) + LBUSER2 = int_hdr.item(38) if LBUSER2 == 3: # Return the numpy array now if it is a boolean array @@ -138,9 +134,7 @@ def __getitem__(self, indices): # Convert to a masked array # ------------------------------------------------------------ # Set the fill_value from BMDI - fill_value = real_hdr.item( - 17, - ) + fill_value = real_hdr.item(17) if fill_value != -1.0e30: # -1.0e30 is the flag for no missing data if integer_array: @@ -151,31 +145,30 @@ def __getitem__(self, indices): # Mask any missing values mask = array == fill_value if mask.any(): - array = numpy.ma.masked_where(mask, array, copy=False) - # --- End: if + array = np.ma.masked_where(mask, array, copy=False) # ------------------------------------------------------------ # Unpack the array using the scale_factor and add_offset, if # either is available # ------------------------------------------------------------ # Treat BMKS as a scale_factor if it is neither 0 nor 1 - scale_factor = real_hdr.item( - 18, - ) + scale_factor = real_hdr.item(18) if scale_factor != 1.0 and scale_factor != 0.0: if integer_array: scale_factor = int(scale_factor) + array *= scale_factor # Treat BDATUM as an add_offset if it is not 0 - add_offset = real_hdr.item( - 4, - ) + add_offset = real_hdr.item(4) if add_offset != 0.0: if integer_array: add_offset = int(add_offset) + array += add_offset + self.close(f) + # Return the numpy array return array @@ -187,15 +180,23 @@ def __str__(self): `str` """ - return "%s%s in %s" % (self.header_offset, self.shape, self.filename) + return f"{self.header_offset}{self.shape} in 
{self.filename}" @property - def file_pointer(self): - """The file pointer starting at the position of the header. + def file_address(self): + """The file name and address. + + .. versionadded:: (cfdm) 1.9.TODO.0 :Returns: - 2-`tuple` + `tuple` + The file name and file address. + + **Examples** + + >>> a.file_address + ('file.pp', 234835) """ return (self.filename, self.header_offset) @@ -269,40 +270,49 @@ def word_size(self): """ return self._get_component("word_size") - def close(self): - """Close the file containing the data array. + def close(self, um): + """Close the dataset containing the data. - If the file is not open then no action is taken. + :Parameters: - :Returns: + um: `umfile_lib.File` + The UM or PP dataset to be closed. - `None` + .. versionadded:: TODODASK - **Examples:** + :Returns: - >>> f.close() + `None` """ - _close_um_file(self.filename) + if self._get_component("close"): + um.close_fd() def open(self): - """Open the file containing the data array. + """Returns an open dataset containing the data array. 
:Returns: `umfile_lib.File` - **Examples:** + **Examples** >>> f.open() """ - return _open_um_file( - self.filename, - fmt=self.fmt, - word_size=self.word_size, - byte_ordering=self.byte_ordering, - ) - - -# --- End: class + try: + f = File( + path=self.filename, + byte_ordering=self.byte_ordering, + word_size=self.word_size, + fmt=self.fmt, + ) + except Exception as error: + try: + f.close_fd() + except Exception: + pass + + raise Exception(error) + else: + return f diff --git a/cf/data/utils.py b/cf/data/utils.py new file mode 100644 index 0000000000..5f2f64c3d9 --- /dev/null +++ b/cf/data/utils.py @@ -0,0 +1,482 @@ +"""General functions useful for `Data` functionality.""" +from functools import lru_cache, partial +from itertools import product + +import dask.array as da +import numpy as np + +from ..cfdatetime import dt as cf_dt +from ..cfdatetime import dt2rt, rt2dt, st2rt +from ..units import Units + + +def convert_to_datetime(array, units): + """Convert a dask array of reference times to date-time objects. + + .. versionadded:: 4.0.0 + + :Parameters: + + array: dask array + + units : `Units` + + :Returns: + + dask array + A new dask array containing datetime objects. + + """ + dx = array.map_blocks(partial(rt2dt, units_in=units), dtype=object) + return dx + + +def convert_to_reftime(array, units, first_value=None): + """Convert a dask array of string or object date-times to floating + point reference times. + + .. versionadded:: 4.0.0 + + :Parameters: + + array: dask array + + units : `Units` + + first_value : scalar, optional + + :Returns: + + dask array, `Units` + A new dask array containing reference times, and its + units. 
+ + """ + kind = array.dtype.kind + if kind in "US": + # Convert date-time strings to reference time floats + if not units: + value = first_non_missing_value(array, first_value) + if value is not None: + YMD = str(value).partition("T")[0] + else: + YMD = "1970-01-01" + + units = Units("days since " + YMD, units._calendar) + + array = array.map_blocks( + partial(st2rt, units_in=units, units_out=units), dtype=float + ) + + elif kind == "O": + # Convert date-time objects to reference time floats + value = first_non_missing_value(array, first_value) + if value is not None: + x = value + else: + x = cf_dt(1970, 1, 1, calendar="gregorian") + + x_since = "days since " + "-".join(map(str, (x.year, x.month, x.day))) + x_calendar = getattr(x, "calendar", "gregorian") + + d_calendar = getattr(units, "calendar", None) + d_units = getattr(units, "units", None) + + if x_calendar != "": + if d_calendar is not None: + if not units.equivalent(Units(x_since, x_calendar)): + raise ValueError( + f"Incompatible units: " + f"{units!r}, {Units(x_since, x_calendar)!r}" + ) + else: + d_calendar = x_calendar + + if not units: + # Set the units to something that is (hopefully) + # close to all of the datetimes, in an attempt to + # reduce errors arising from the conversion to + # reference times + units = Units(x_since, calendar=d_calendar) + else: + units = Units(d_units, calendar=d_calendar) + + # Check that all date-time objects have correct and + # equivalent calendars + calendars = unique_calendars(array) + if len(calendars) > 1: + raise ValueError( + "Not all date-time objects have equivalent " + f"calendars: {tuple(calendars)}" + ) + + # If the date-times are calendar-agnostic, assign the + # given calendar, defaulting to Gregorian. 
+ if calendars.pop() == "": + calendar = getattr(units, "calendar", "gregorian") + + # TODODASK: can map_blocks this, I think + new_array = da.empty_like(array, dtype=object) + for i in np.ndindex(new_array.shape): + new_array[i] = cf_dt(array[i], calendar=calendar) + + array = new_array + + # Convert the date-time objects to reference times + array = array.map_blocks(dt2rt, units_out=units, dtype=float) + + if not units.isreftime: + raise ValueError( + f"Can't create a reference time array with units {units!r}" + ) + + return array, units + + +def first_non_missing_value(array, cached=None): + """Return the first non-missing value of an array. + + If the array contains only missing data then `None` is returned. + + If a cached value is provided then that is returned without + looking for the actual first non-missing value. + + .. versionadded:: 4.0.0 + + :Parameters: + + array: dask array + The array to be inspected. + + cached: scalar, optional + If set to a value other than `None`, then return this value + instead of inspecting the array. + + :Returns: + + If the *cached* parameter is set then its value is + returned. Otherwise return the first non-missing value, or + `None` if there isn't one. + + """ + if cached is not None: + return cached + + # This does not look particularly efficient, but the expectation + # is that the first element in the array will not be missing data. + + shape = array.shape + for i in range(array.size): + index = np.unravel_index(i, shape) + x = array[index].compute() + if x is np.ma.masked: + continue + + return x.item() + + return None + + +def unique_calendars(array): + """Find the unique calendars from a dask array of date-time objects. + + .. versionadded:: 4.0.0 + + :Returns: + + `set` + The unique calendars. 
+ + """ + + def _get_calendar(x): + return getattr(x, "calendar", "gregorian") + + _calendars = np.vectorize(_get_calendar, otypes=[np.dtype(str)]) + + array = array.map_blocks(_calendars, dtype=str) + + cals = da.unique(array).compute() + if np.ma.isMA(cals): + cals = cals.compressed() + + # TODODASK - need to allow different but equivalent calendars, such + # as "gregorian" and 'standard'. Or perhaps this should be done by the + # caller? + + return set(cals.tolist()) + + +@lru_cache(maxsize=32) +def new_axis_identifier(existing_axes=(), basename="dim"): + """Return a new, unique axis identifier. + + The name is arbitrary and has no semantic meaning. + + .. versionadded:: TODODASK + + :Parameters: + + existing_axes: `tuple` of `str`, optional + Any existing axis names that are not to be duplicated. + + basename: `str`, optional + The root of the new axis identifier. The new axis + identifier will be this root followed by an integer. + + :Returns: + + `str` + The new axis identifier. + + **Examples:** + + >>> new_axis_identifier() + 'dim0' + >>> new_axis_identifier(('dim0',)) + 'dim1' + >>> new_axis_identifier(('dim3',)) + 'dim1' + >>> new_axis_identifier(('dim1',)) + 'dim2' + >>> new_axis_identifier(('dim1', 'dim0')) + 'dim2' + >>> new_axis_identifier(('dim3', 'dim4')) + 'dim2' + >>> new_axis_identifier(('dim2', 'dim0')) + 'dim3' + >>> new_axis_identifier(('dim3', 'dim4', 'dim0')) + 'dim5' + >>> new_axis_identifier(basename='axis') + 'axis0' + >>> new_axis_identifier((), basename='axis') + 'axis0' + >>> new_axis_identifier(('dim0',), basename='axis') + 'axis1' + >>> new_axis_identifier(('dim0', 'dim1'), basename='axis') + 'axis2' + + """ + n = len(existing_axes) + axis = f"{basename}{n}" + while axis in existing_axes: + n += 1 + axis = f"{basename}{n}" + + return axis + + +def chunk_positions(chunks): + """Find the position of each chunk. + + .. versionadded:: TODODASK + + .. 
seealso:: `chunk_shapes` + + :Parameters: + + chunks: `tuple` + The chunk sizes along each dimension, as output by + `dask.array.Array.chunks`. + + **Examples** + + >>> chunks = ((1, 2), (9,), (44, 55, 66)) + >>> for position in chunk_positions(chunks): + ... print(position) + ... + (0, 0, 0) + (0, 0, 1) + (0, 0, 2) + (1, 0, 0) + (1, 0, 1) + (1, 0, 2) + + """ + return product(*(range(len(bds)) for bds in chunks)) + + +def chunk_shapes(chunks): + """Find the shape of each chunk. + + .. versionadded:: TODODASK + + .. seealso:: `chunk_positions` + + :Parameters: + + chunks: `tuple` + The chunk sizes along each dimension, as output by + `dask.array.Array.chunks`. + + **Examples** + + >>> chunks = ((1, 2), (9,), (4, 5, 6)) + >>> for shape in chunk_shapes(chunks): + ... print(shape) + ... + (1, 9, 4) + (1, 9, 5) + (1, 9, 6) + (2, 9, 4) + (2, 9, 5) + (2, 9, 6) + + """ + return product(*chunks) + + +def is_small(array, threshold=None): + """TODODASK - need to define what 'small' is, and consider the API + in general + + We adjust the size of the data here for the potential of a mask. + + Returns False if size is unknown. + + .. versionadded:: 4.0.0 + """ + if threshold is None: + threshold = 2 ** 90 # TODODASK - True for now! + + return array.size * (array.dtype.itemsize + 1) < threshold + + +def is_very_small(array, threshold=None): + """ + TODODASK - need to define what 'very small' is, and consider the API + in general + + .. versionadded:: 4.0.0 + + """ + if threshold is None: + threshold = 0.125 * 2 ** 90 # TODODASK - True for now! + + return is_small(array, threshold) + + +def dask_compatible(a): + """Convert an object to one which is dask compatible. + + The object is returned unchanged unless it is a cf object + containing data, in which case the dask array of the data is + returned instead. + + .. 
versionadded:: 4.0.0 + + """ + try: + return a.data.get_dask(copy=False) + except AttributeError: + return a + + +def scalar_masked_array(dtype=float): + """Return a scalar masked array. + + .. versionadded:: TODODASK + + :Parmaeters: + + dtype: data-type, optional + Desired output data-type for the array, e.g, + `numpy.int8`. Default is `numpy.float64`. + + :Returns: + + `np.ma.core.MaskedArray` + The scalar masked array. + + **Examples** + + >>> scalar_masked_array() + masked_array(data=--, + mask=True, + fill_value=1e+20, + dtype=float64) + >>> scalar_masked_array(dtype('int32')) + masked_array(data=--, + mask=True, + fill_value=999999, + dtype=int32) + >>> scalar_masked_array('U45') + masked_array(data=--, + mask=True, + fill_value='N/A', + dtype='>> scalar_masked_array(bool) + masked_array(data=--, + mask=True, + fill_value=True, + dtype=bool) + + """ + a = np.ma.empty((), dtype=dtype) + a.mask = True + return a + + +def conform_units(value, units): + """Conform units. + + If *value* has units defined by its `Units` attribute then + + * if the value units are equal to *units* then *value* is returned + unchanged; + + * if the value units are equivalent to *units* then a copy of + *value* converted to *units* is returned; + + * if the value units are not equivalent to *units* then an + exception is raised. + + In all other cases *value* is returned unchanged. + + .. versionadded:: TODODASK + + :Parameters: + + value: + The value whose units are to be conformed to *units*. + + units: `Units` + The units to conform to. 
+ + **Examples:** + + >>> conform_units(1, cf.Units('metres')) + 1 + >>> conform_units([1, 2, 3], cf.Units('metres')) + [1, 2, 3] + >>> import numpy + >>> conform_units(numpy.array([1, 2, 3]), cf.Units('metres')) + array([1, 2, 3]) + >>> conform_units('string', cf.Units('metres')) + 'string' + >>> d = cf.Data([1, 2] , 'm') + >>> conform_units(d, cf.Units('metres')) + + >>> d = cf.Data([1, 2] , 'km') + >>> conform_units(d, cf.Units('metres')) + + >>> conform_units(d, cf.Units('s')) + ... + ValueError: Units are incompatible with units + + """ + try: + value_units = value.Units + except AttributeError: + pass + else: + if value_units.equivalent(units): + if value_units != units: + value = value.copy() + value.Units = units + elif value_units and units: + raise ValueError( + f"Units {value_units!r} are incompatible with units {units!r}" + ) + + return value diff --git a/cf/dimensioncoordinate.py b/cf/dimensioncoordinate.py index a20c1de2e4..0e84f8bb18 100644 --- a/cf/dimensioncoordinate.py +++ b/cf/dimensioncoordinate.py @@ -116,28 +116,14 @@ def _infer_direction(self): # Infer the direction from the data if data._size > 1: data = data[0:2].array - return bool( - data.item( - 0, - ) - < data.item( - 1, - ) - ) + return bool(data.item(0) < data.item(1)) # Still here? data = self.get_bounds_data(None, _fill_value=False) if data is not None: # Infer the direction from the bounds b = data[(0,) * (data.ndim - 1)].array - return bool( - b.item( - 0, - ) - < b.item( - 1, - ) - ) + return bool(b.item(0) < b.item(1)) # Still here? Then infer the direction from the units. 
return not self.Units.ispressure @@ -584,27 +570,9 @@ def create_bounds( "Can't create bounds for Voronoi cells from one value" ) - bounds_1d = [ - array.item( - 0, - ) - * 1.5 - - array.item( - 1, - ) - * 0.5 - ] + bounds_1d = [array.item(0) * 1.5 - array.item(1) * 0.5] bounds_1d.extend((array[0:-1] + array[1:]) * 0.5) - bounds_1d.append( - array.item( - -1, - ) - * 1.5 - - array.item( - -2, - ) - * 0.5 - ) + bounds_1d.append(array.item(-1) * 1.5 - array.item(-2) * 0.5) dtype = type(bounds_1d[0]) @@ -628,29 +596,13 @@ def create_bounds( array = array[::-1] bounds_1d = [bound] - if bound <= array.item( - 0, - ): + if bound <= array.item(0): for i in range(size): - bound = ( - 2.0 - * array.item( - i, - ) - - bound - ) + bound = 2.0 * array.item(i) - bound bounds_1d.append(bound) - elif bound >= array.item( - -1, - ): + elif bound >= array.item(-1): for i in range(size - 1, -1, -1): - bound = ( - 2.0 - * array.item( - i, - ) - - bound - ) + bound = 2.0 * array.item(i) - bound bounds_1d.append(bound) bounds_1d = bounds_1d[::-1] diff --git a/cf/field.py b/cf/field.py index d40ed264f4..e51df06a15 100644 --- a/cf/field.py +++ b/cf/field.py @@ -227,12 +227,7 @@ # -------------------------------------------------------------------- # These Data methods may specify a number of degrees of freedom # -------------------------------------------------------------------- -_collapse_ddof_methods = set( - ( - "sd", - "var", - ) -) +_collapse_ddof_methods = set(("sd", "var")) _earth_radius = Data(6371229.0, "m") @@ -460,13 +455,9 @@ def __getitem__(self, indices): f"{self.constructs.domain_axis_identity(_)!r} axis" ) - logger.debug( - f" roll, iaxis, shift = {roll} {iaxis} {shift}" - ) # pragma: no cover - - new = new.roll(iaxis, shift) + new = new.roll(shift=shift, axis=iaxis) else: - new = self.copy() + new = self.copy(array=False) # ------------------------------------------------------------ # Subspace the field construct's data @@ -482,7 +473,7 @@ def __getitem__(self, 
indices): logger.debug(" indices2 = {}".format(indices2)) # pragma: no cover logger.debug(" findices = {}".format(findices)) # pragma: no cover - new_data = new.data[tuple(findices)] + new_data = data[tuple(findices)] # Set sizes of domain axes data_axes = new.get_data_axes() @@ -1995,9 +1986,7 @@ def _binary_operation(self, other, method): coord_type = None key, coord = f.dimension_coordinate( - item=True, - default=(None, None), - filter_by_axis=(axis,), + item=True, default=(None, None), filter_by_axis=(axis,) ) if coord is not None: # This axis of the domain has a dimension @@ -8624,8 +8613,7 @@ def collapse( # filter_by_axis=(axis,), axis_mode="and", todict=Tru#e # ).value(None) dc = f.dimension_coordinate( - filter_by_axis=(axis,), - default=None, + filter_by_axis=(axis,), default=None ) if dc is not None and not dc.has_bounds(): dc.set_bounds(dc.create_bounds(cellsize=0)) @@ -8902,11 +8890,7 @@ def collapse( # which span this axis # c = auxiliary_coordinates.filter_by_naxes(gt(1), view=True) c = f.auxiliary_coordinates( - filter_by_naxes=( - gt( - 1, - ), - ), + filter_by_naxes=(gt(1),), filter_by_axis=(axis,), axis_mode="or", todict=True, diff --git a/cf/formula_terms.py b/cf/formula_terms.py index dcda4b7376..c2744fcd28 100644 --- a/cf/formula_terms.py +++ b/cf/formula_terms.py @@ -638,7 +638,7 @@ def _check_standard_name_consistency( "Terms {} have no standard names. " "See Appendix D: Parametric Vertical Coordinates " "of the CF conventions.".format( - ", ".join(repr(term) for term in kwargs), + ", ".join(repr(term) for term in kwargs) ) ) @@ -647,7 +647,7 @@ def _check_standard_name_consistency( "Terms {} have incompatible standard names. 
" "See Appendix D: Parametric Vertical Coordinates " "of the CF conventions.".format( - ", ".join(repr(term) for term in kwargs), + ", ".join(repr(term) for term in kwargs) ) ) diff --git a/cf/functions.py b/cf/functions.py index 42a6a8711e..f45ca37429 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -13,6 +13,7 @@ from hashlib import md5 as hashlib_md5 from marshal import dumps as marshal_dumps from math import ceil as math_ceil +from numbers import Integral from os import getpid, listdir, mkdir from os.path import abspath as _os_path_abspath from os.path import dirname as _os_path_dirname @@ -29,17 +30,11 @@ from numpy import __version__ as _numpy__version__ from numpy import all as _numpy_all from numpy import allclose as _x_numpy_allclose -from numpy import array as _numpy_array from numpy import ascontiguousarray as _numpy_ascontiguousarray -from numpy import integer as _numpy_integer from numpy import isclose as _x_numpy_isclose -from numpy import ndim as _numpy_ndim from numpy import shape as _numpy_shape -from numpy import sign as _numpy_sign -from numpy import size as _numpy_size from numpy import take as _numpy_take from numpy import tile as _numpy_tile -from numpy import where as _numpy_where from numpy.ma import all as _numpy_ma_all from numpy.ma import allclose as _numpy_ma_allclose from numpy.ma import is_masked as _numpy_ma_is_masked @@ -1877,10 +1872,11 @@ def _numpy_isclose(a, b, rtol=None, atol=None): return a == b -def parse_indices( - shape, indices, cyclic=False, reverse=False, envelope=False, mask=False -): - """TODO. +# TODODASK - sort out the "numpy" environment + + +def parse_indices(shape, indices, cyclic=False, keepdims=True): + """TODODASK. :Parameters: @@ -1888,33 +1884,31 @@ def parse_indices( indices: `tuple` (not a `list`!) + keepdims: `bool`, optional + If True then an integral index is converted to a + slice. For instance, ``3`` would become ``slice(3, 4)``. + :Returns: `list` [, `dict`] + The parsed indices. 
If *cyclic* is True the a dictionary + is also returned that contains the parameters needed to + interpret any cyclic slices. **Examples:** >>> cf.parse_indices((5, 8), ([1, 2, 4, 6],)) [array([1, 2, 4, 6]), slice(0, 8, 1)] - >>> cf.parse_indices((5, 8), ([2, 4, 6],)) - [slice(2, 7, 2), slice(0, 8, 1)] + >>> cf.parse_indices((5, 8), (Ellipsis, [2, 4, 6])) + [slice(0, 5, 1), slice(2, 7, 2)] """ parsed_indices = [] roll = {} - flip = [] - compressed_indices = [] - mask_indices = [] if not isinstance(indices, tuple): indices = (indices,) - if mask and indices: - arg0 = indices[0] - if isinstance(arg0, str) and arg0 == "mask": - mask_indices = indices[1] - indices = indices[2:] - # Initialize the list of parsed indices as the input indices with any # Ellipsis objects expanded length = len(indices) @@ -1935,36 +1929,20 @@ def parse_indices( if ndim and len_parsed_indices > ndim: raise IndexError( - "Invalid indices {} for array with shape {}".format( - parsed_indices, shape - ) + f"Invalid indices {parsed_indices} for array with shape {shape}" ) if len_parsed_indices < ndim: parsed_indices.extend([slice(None)] * (ndim - len_parsed_indices)) if not ndim and parsed_indices: - # # If data is scalar then allow it to be indexed with an - # # equivalent to [0] - # if (len_parsed_indices == 1 and - # parsed_indices[0] in (0, - # -1, - # slice(0, 1), - # slice(-1, None, -1), - # slice(None, None, None))): - # parsed_indices = [] - # else: raise IndexError( "Scalar array can only be indexed with () or Ellipsis" ) for i, (index, size) in enumerate(zip(parsed_indices, shape)): - is_slice = False - if isinstance(index, slice): - # -------------------------------------------------------- - # Index is a slice - # -------------------------------------------------------- - is_slice = True + if cyclic and isinstance(index, slice): + # Check for a cyclic slice start = index.start stop = index.stop step = index.step @@ -2014,11 +1992,8 @@ def parse_indices( # -9:0:1 => [1, 2, 3, 4, 5, 6, 
7, 8, 9] # -9:1:1 => [1, 2, 3, 4, 5, 6, 7, 8, 9, 0] # -10:0:1 => [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - if cyclic: - index = slice(0, stop - start, step) - roll[i] = -start - else: - index = slice(start, stop, step) + index = slice(0, stop - start, step) + roll[i] = -start elif step < 0 and 0 <= start < size and start - size <= stop < 0: # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] @@ -2028,204 +2003,22 @@ def parse_indices( # 6:-4:-1 => [6, 5, 4, 3, 2, 1, 0, 9, 8, 7] # 0:-2:-1 => [0, 9] # 0:-10:-1 => [0, 9, 8, 7, 6, 5, 4, 3, 2, 1] - if cyclic: - index = slice(start - stop - 1, None, step) - roll[i] = -1 - stop - else: - index = slice(start, stop, step) + index = slice(start - stop - 1, None, step) + roll[i] = -1 - stop + elif keepdims and isinstance(index, Integral): + # Convert an integral index to a slice + if index == -1: + index = slice(-1, None, None) else: - start, stop, step = index.indices(size) - if ( - start == stop - or (start < stop and step < 0) - or (start > stop and step > 0) - ): - raise IndexError( - "Invalid indices dimension with size {}: {}".format( - size, index - ) - ) - - if step < 0 and stop < 0: - stop = None - index = slice(start, stop, step) - - elif isinstance(index, (int, _numpy_integer)): - # -------------------------------------------------------- - # Index is an integer - # -------------------------------------------------------- - if index < 0: - index += size - - index = slice(index, index + 1, 1) - is_slice = True - else: - convert2positve = True - if getattr( - getattr(index, "dtype", None), "kind", None - ) == "b" or isinstance(index[0], bool): - # ---------------------------------------------------- - # Index is a sequence of booleans - # ---------------------------------------------------- - # Convert booleans to non-negative integers. We're - # assuming that anything with a dtype attribute also - # has a size attribute. 
- if _numpy_size(index) != size: - raise IndexError( - "Incorrect number ({}) of boolean indices for " - "dimension with size {}: {}".format( - _numpy_size(index), size, index - ) - ) - - index = _numpy_where(index)[0] - convert2positve = False - - if not _numpy_ndim(index): - if index < 0: - index += size - index = slice(index, index + 1, 1) - is_slice = True - else: - len_index = len(index) - if len_index == 1: - index = index[0] - if index < 0: - index += size - - index = slice(index, index + 1, 1) - is_slice = True - elif len_index: - if convert2positve: - # Convert to non-negative integer numpy array - index = _numpy_array(index) - index = _numpy_where(index < 0, index + size, index) - - steps = index[1:] - index[:-1] - step = steps[0] - if step and not (steps - step).any(): - # Replace the numpy array index with a slice - if step > 0: - start, stop = index[0], index[-1] + 1 - elif step < 0: - start, stop = index[0], index[-1] - 1 - - if stop < 0: - stop = None - - index = slice(start, stop, step) - is_slice = True - else: - if ( - (step > 0 and (steps <= 0).any()) - or (step < 0 and (steps >= 0).any()) - or not step - ): - raise ValueError( - "Bad index (not strictly monotonic): " - "{}".format(index) - ) - - if reverse and step < 0: - # The array is strictly monotonically - # decreasing, so reverse it so that it's - # strictly monotonically increasing. 
Make - # a note that this dimension will need - # flipping later - index = index[::-1] - flip.append(i) - step = -step - - if envelope: - # Create an envelope slice for a parsed - # index of a numpy array of integers - compressed_indices.append(index) - - step = _numpy_sign(step) - if step > 0: - stop = index[-1] + 1 - else: - stop = index[-1] - 1 - if stop < 0: - stop = None - - index = slice(index[0], stop, step) - is_slice = True - else: - raise IndexError( - "Invalid indices {} for array with shape {}".format( - parsed_indices, shape - ) - ) - - if is_slice: - if reverse and index.step < 0: - # If the slice step is negative, then transform - # the original slice to a new slice with a - # positive step such that the result of the new - # slice is the reverse of the result of the - # original slice. - # - # For example, if the original slice is - # slice(6,0,-2) then the new slice will be - # slice(2,7,2): - # - # >>> a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - # >>> a[slice(6, 0, -2)] - # [6, 4, 2] - # >>> a[slice(2, 7, 2)] - # [2, 4, 6] - # a[slice(6, 0, -2)] == list(reversed(a[slice(2, 7, 2)])) - # True - start, stop, step = index.indices(size) - step *= -1 - div, mod = divmod(start - stop - 1, step) - div_step = div * step - start -= div_step - stop = start + div_step + 1 - - index = slice(start, stop, step) - flip.append(i) - - # If step is greater than one then make sure that - # index.stop isn't bigger than it needs to be - if cyclic and index.step > 1: - start, stop, step = index.indices(size) - div, mod = divmod(stop - start - 1, step) - stop = start + div * step + 1 - index = slice(start, stop, step) - - # - if envelope: - # Create an envelope slice for a parsed - # index of a numpy array of integers - compressed_indices.append(index) - index = slice( - start, stop, (1 if reverse else _numpy_sign(step)) - ) parsed_indices[i] = index - if not (cyclic or reverse or envelope or mask): + if not cyclic: return parsed_indices - out = [parsed_indices] - - if cyclic: 
- out.append(roll) - - if reverse: - out.append(flip) - - if envelope: - out.append(compressed_indices) - - if mask: - out.append(mask_indices) - - return out + return parsed_indices, roll def get_subspace(array, indices): diff --git a/cf/mixin/fielddomain.py b/cf/mixin/fielddomain.py index 7ea96a746f..ffe5b25bc1 100644 --- a/cf/mixin/fielddomain.py +++ b/cf/mixin/fielddomain.py @@ -323,10 +323,7 @@ def _indices(self, mode, data_axes, auxiliary_mask, **kwargs): for identity, value in kwargs.items(): key, construct = self.construct( - identity, - filter_by_data=True, - item=True, - default=(None, None), + identity, filter_by_data=True, item=True, default=(None, None) ) if construct is not None: axes = self.get_data_axes(key) @@ -729,10 +726,7 @@ def _indices(self, mode, data_axes, auxiliary_mask, **kwargs): )[0] # Include the auxiliary mask - indices = { - "indices": indices, - "mask": auxiliary_mask, - } + indices = {"indices": indices, "mask": auxiliary_mask} logger.debug(f" indices = {indices!r}") # pragma: no cover @@ -1320,8 +1314,7 @@ def del_coordinate_reference( return return self._default( - default, - f"Can't identify construct from {identity!r}", + default, f"Can't identify construct from {identity!r}" ) ref = self.del_construct(key) @@ -2853,11 +2846,7 @@ def measure( ): """Alias for `cell_measure`.""" return self.cell_measure( - *identity, - key=key, - default=default, - item=item, - **filter_kwargs, + *identity, key=key, default=default, item=item, **filter_kwargs ) def measures(self, *identities, **filter_kwargs): @@ -2874,11 +2863,7 @@ def ref( ): """Alias for `coordinate_reference`.""" return self.coordinate_reference( - *identity, - key=key, - default=default, - item=item, - **filter_kwargs, + *identity, key=key, default=default, item=item, **filter_kwargs ) def refs(self, *identities, **filter_kwargs): diff --git a/cf/mixin/propertiesdata.py b/cf/mixin/propertiesdata.py index 796190b7c1..c6d8437dcb 100644 --- a/cf/mixin/propertiesdata.py +++ 
b/cf/mixin/propertiesdata.py @@ -42,6 +42,10 @@ logger = logging.getLogger(__name__) +class DeprecationError(Exception): + pass + + class PropertiesData(Properties): """Mixin class for a data array with metadata.""" @@ -2368,11 +2372,11 @@ def hardmask(self): @property def array(self): - """A numpy array deep copy of the data array. + """A numpy array deep copy of the data. Changing the returned numpy array does not change the data array. - .. seealso:: `data`, `datetime_array`, `varray` + .. seealso:: `data`, `datetime_array`, `dask_array` **Examples:** @@ -2394,19 +2398,38 @@ def array(self): """ data = self.get_data(None) if data is None: - raise AttributeError( - f"{self.__class__.__name__} has no data array" - ) + raise AttributeError(f"{self.__class__.__name__} has no data") return data.array + def dask_array(self, copy=True): + """TODODASK. + + :Parameters: + + copy + + + .. seealso:: `data`, `array`, `datetime_array` + + **Examples:** + + TODODASK + + """ + data = self.get_data(None) + if data is None: + raise AttributeError(f"{self.__class__.__name__} has no data") + + return data.dask_array(copy=copy) + @property def varray(self): - """A numpy array view of the data array. + """A numpy array view of the data. Changing the elements of the returned view changes the data array. - .. seealso:: `array`, `data`, `datetime_array` + .. seealso:: `array`, `data`, `datetime_array`, `dask_array` **Examples:** @@ -2426,13 +2449,15 @@ def varray(self): """ - data = self.get_data(None) - if data is None: - raise AttributeError( - f"{self.__class__.__name__} has no data array" - ) + raise DeprecationError("TODODASK") - return data.varray + # data = self.get_data(None) + # if data is None: + # raise AttributeError( + # f"{self.__class__.__name__} has no data" + # ) + # + # return data.varray @property def isscalar(self): @@ -2507,21 +2532,21 @@ def ceil(self, inplace=False, i=False): delete_props=True, ) - def chunk(self, chunksize=None): - """Partition the data array. 
- - :Parameters: - - chunksize: `int` - - :Returns: - - `None` - - """ - data = self.get_data(None, _fill_value=False) - if data is not None: - data.chunk(chunksize) + # def chunk(self, chunksize=None): + # '''Partition the data array. + # + # :Parameters: + # + # chunksize: `int` + # + # :Returns: + # + # `None` + # + # ''' + # data = self.get_data(None) + # if data is not None: + # data.chunk(chunksize) @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) diff --git a/cf/query.py b/cf/query.py index d9f05f1a80..8aed2c4aa5 100644 --- a/cf/query.py +++ b/cf/query.py @@ -1,4 +1,5 @@ import logging +from copy import deepcopy from operator import __and__ as operator_and from operator import __or__ as operator_or @@ -273,7 +274,7 @@ def __and__(self, other): new = Q.__new__(Q) new._operator = None - new._compound = (self, other) + new._compound = (self.copy(), other.copy()) new._bitwise_operator = operator_and new._attr = () @@ -332,7 +333,7 @@ def __str__(self): attr = ".".join(self._attr) if not self._compound: - out = f"{attr}({self._operator} " + str(self._value) + out = f"{attr}({self._operator} {self._value})" else: bitwise_operator = repr(self._bitwise_operator) if "and_" in bitwise_operator: @@ -465,7 +466,18 @@ def copy(self): >>> r = q.copy() """ - return self # TODO + Q = type(self) + new = Q.__new__(Q) + + d = self.__dict__.copy() + new.__dict__ = d + + if d["_compound"]: + d["_compound"] = deepcopy(d["_compound"]) + else: + d["_value"] = deepcopy(d["_value"]) + + return new @_display_or_return def dump(self, display=True): @@ -745,6 +757,83 @@ def inspect(self): """ print(_inspect(self)) # pragma: no cover + def set_condition_units(self, units): + """Set units of condition values in-place. + + .. versionadded:: TODO + + :Parameters: + + units: `str` or `Units` + + The units to be set on all condition values. 
+ + :Returns: + + `None` + + **Examples** + + >>> q = cf.lt(9) + >>> q + + >>> q.set_condition_units('km') + >>> q + + >>> q.set_condition_units('seconds') + ... + ValueError: Units are not equivalent to query condition units + + >>> q = cf.lt(9, units='m') + >>> q + + >>> q.set_condition_units('km') + >>> q + + + >>> q = cf.lt(9) + >>> r = cf.ge(3000, units='m') + >>> s = q & r + >>> s + + >>> s.set_condition_units('km') + >>> s + + >>> q + + >>> r + + + """ + units = Units(units) + + compound = self._compound + if compound: + for r in compound: + r.set_condition_units(units) + + return + + value = self._value + if value is None: + return + + value_units = getattr(value, "Units", None) + if value_units is None: + # Value has no units + value = Data(value, units=units) + else: + # Value already has units + try: + value.Units = units + except ValueError: + raise ValueError( + f"Units {units!r} are not equivalent to " + f"query condition units {value_units!r}" + ) + + self._value = value + # ---------------------------------------------------------------- # Deprecated attributes and methods # ---------------------------------------------------------------- diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 43cd29b381..94b7e956da 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -16,7 +16,7 @@ class NetCDFRead(cfdm.read_write.netcdf.NetCDFRead): """ - def _ncdimensions(self, ncvar): + def _ncdimensions(self, ncvar, ncdimensions=None, parent_ncvar=None): """Return a list of the netCDF dimensions corresponding to a netCDF variable. @@ -33,6 +33,21 @@ def _ncdimensions(self, ncvar): ncvar: `str` The netCDF variable name. + ncdimensions: sequence of `str`, optional + Use these netCDF dimensions, rather than retrieving them + from the netCDF variable itself. This allows the + dimensions of a domain variable to be parsed. 
Note that + this only parameter only needs to be used once because the + parsed domain dimensions are automatically stored in + `self.read_var['domain_ncdimensions'][ncvar]`. + + .. versionadded:: 3.11.0 + + parent_ncvar: `str`, optional + TODO + + .. versionadded:: TODO + :Returns: `list` @@ -43,6 +58,25 @@ def _ncdimensions(self, ncvar): >>> n._ncdimensions('humidity') ['time', 'lat', 'lon'] + For a variable compressed by gathering: + + dimensions: + lat=73; + lon=96; + landpoint=2381; + depth=4; + variables: + int landpoint(landpoint); + landpoint:compress="lat lon"; + float landsoilt(depth,landpoint); + landsoilt:long_name="soil temperature"; + landsoilt:units="K"; + + we would have + + >>> n._ncdimensions('landsoilt') + ['depth', 'lat', 'lon'] + """ g = self.read_vars @@ -54,16 +88,18 @@ def _ncdimensions(self, ncvar): ) if not cfa: - return super()._ncdimensions(ncvar) + return super()._ncdimensions( + ncvar, ncdimensions=ncdimensions, parent_ncvar=parent_ncvar + ) - # Still here? + # Still here? Then we have a CFA variable. ncdimensions = ( g["variable_attributes"][ncvar].get("cfa_dimensions", "").split() ) return list(map(str, ncdimensions)) - def _get_domain_axes(self, ncvar, allow_external=False): + def _get_domain_axes(self, ncvar, allow_external=False, parent_ncvar=None): """Return the domain axis identifiers that correspond to a netCDF variable's netCDF dimensions. @@ -79,6 +115,11 @@ def _get_domain_axes(self, ncvar, allow_external=False): If `True` and *ncvar* is an external variable then return an empty list. + parent_ncvar: `str`, optional + TODO + + .. versionadded:: TODO + :Returns: `list` @@ -103,7 +144,9 @@ def _get_domain_axes(self, ncvar, allow_external=False): if not cfa: return super()._get_domain_axes( - ncvar=ncvar, allow_external=allow_external + ncvar=ncvar, + allow_external=allow_external, + parent_ncvar=parent_ncvar, ) # Still here? 
@@ -127,6 +170,7 @@ def _create_data( unpacked_dtype=False, uncompress_override=None, parent_ncvar=None, + coord_ncvar=None, ): """TODO. @@ -143,6 +187,12 @@ def _create_data( uncompress_override: `bool`, optional + parent_ncvar: `str`, optional + + coord_ncvar: `str`, optional + + .. versionadded:: TODO + :Returns: `Data` @@ -165,6 +215,7 @@ def _create_data( unpacked_dtype=unpacked_dtype, uncompress_override=uncompress_override, parent_ncvar=parent_ncvar, + coord_ncvar=coord_ncvar, ) # ------------------------------------------------------------ @@ -291,16 +342,12 @@ def _create_Data( The netCDF variable from which to get units and calendar. """ - try: - compressed = array.get_compression_type() # TODO - except AttributeError: - compressed = False + chunks = self.read_vars.get("chunks", "auto") + + # dask_from_array = {'lock': array._dask_lock, + # 'asarray': array._dask_asarray} - if not compressed: - # Do not chunk compressed data (for now ...) - chunk = False - else: - chunk = self.read_vars.get("chunk", True) + # TODODASK - is this necessar given that each NetCDFArray.__getitem__ could open (and then close) it's own netCDF4.Dataset instance? 
return super()._create_Data( array=array, @@ -308,7 +355,7 @@ def _create_Data( calendar=calendar, ncvar=ncvar, loadd=loadd, - chunk=chunk, + chunks=chunks, **kwargs ) diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index 17b80e21a2..d9211e60b1 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -477,10 +477,7 @@ def _write_cfa_data(self, ncvar, ncdimensions, data, cfvar): for size in array.shape ] - for ( - ncdim, - size, - ) in zip(cfa_dimensions, array.shape): + for (ncdim, size) in zip(cfa_dimensions, array.shape): if ncdim not in g["ncdim_to_size"]: # This cfa private dimension needs creating g["ncdim_to_size"][ncdim] = size diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 08a5a04ad8..30744d2703 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -64,6 +64,7 @@ def read( follow_symlinks=False, mask=True, warn_valid=False, + chunks="auto", ): """Read field constructs from netCDF, CDL, PP or UM fields datasets. @@ -502,6 +503,10 @@ def read( .. versionadded:: 1.5 + chunks: TODODASK + + .. versionadded:: TODODASK + umversion: deprecated at version 3.0.0 Use the *um* parameter instead. @@ -517,6 +522,9 @@ def read( select_options: deprecated at version 3.0.0 Use methods on the returned `FieldList` instead. + chunk: deprecated at version TODODASK + Use the *chunks* parameter instead. 
+ :Returns: `FieldList` @@ -587,6 +595,14 @@ def read( "Use keyword 'um' instead.", ) # pragma: no cover + if chunk is not True: + _DEPRECATION_ERROR_FUNCTION_KWARGS( + "cf.read", + {"chunk": chunk}, + "Use keyword 'chunks' instead.", + version="TODODASK", + ) # pragma: no cover + # Parse select # TODO - delete the "if python36:" clause when Python 3.6 is # deprecated @@ -615,9 +631,8 @@ def read( raise ValueError("squeeze and unsqueeze can not both be True") if follow_symlinks and not recursive: raise ValueError( - "Can't set follow_symlinks={0} when recursive={1}".format( - follow_symlinks, recursive - ) + f"Can't set follow_symlinks={follow_symlinks!r} " + f"when recursive={recursive!r}" ) # Initialize the output list of fields @@ -754,7 +769,8 @@ def read( um=um, extra=extra, height_at_top_of_model=height_at_top_of_model, - chunk=chunk, + # chunk=chunk, + chunks=chunks, mask=mask, warn_valid=warn_valid, select=select, @@ -870,9 +886,9 @@ def _read_a_file( um=None, extra=None, height_at_top_of_model=None, - chunk=True, mask=True, warn_valid=False, + chunks="auto", select=None, ): """Read the contents of a single file into a field list. @@ -976,7 +992,7 @@ def _read_a_file( # return FieldList() extra_read_vars = { - "chunk": chunk, + "chunks": chunks, "fmt": selected_fmt, "ignore_read_error": ignore_read_error, # 'cfa' defaults to False. 
If the file has @@ -1051,9 +1067,7 @@ def _read_a_file( fmt=fmt, word_size=word_size, endian=endian, - chunk=chunk, - select=select, - ) # , mask=mask, warn_valid=warn_valid) + ) # PP fields are aggregated intrafile prior to interfile # aggregation diff --git a/cf/read_write/um/filearray.py b/cf/read_write/um/filearray.py index 4c933e29db..79c7426fc5 100644 --- a/cf/read_write/um/filearray.py +++ b/cf/read_write/um/filearray.py @@ -78,22 +78,13 @@ def __getitem__(self, indices): int_hdr = rec.int_hdr real_hdr = rec.real_hdr - array = rec.get_data().reshape( - int_hdr.item( - 17, - ), - int_hdr.item( - 18, - ), - ) + array = rec.get_data().reshape(int_hdr.item(17), int_hdr.item(18)) if indices is not Ellipsis: indices = parse_indices(array.shape, indices) array = get_subspace(array, indices) - LBUSER2 = int_hdr.item( - 38, - ) + LBUSER2 = int_hdr.item(38) if LBUSER2 == 3: # Return the numpy array now if it is a boolean array @@ -105,9 +96,7 @@ def __getitem__(self, indices): # Convert to a masked array # ------------------------------------------------------------ # Set the fill_value from BMDI - fill_value = real_hdr.item( - 17, - ) + fill_value = real_hdr.item(17) if fill_value != -1.0e30: # -1.0e30 is the flag for no missing data if integer_array: @@ -125,18 +114,14 @@ def __getitem__(self, indices): # either is available # ------------------------------------------------------------ # Treat BMKS as a scale_factor if it is neither 0 nor 1 - scale_factor = real_hdr.item( - 18, - ) + scale_factor = real_hdr.item(18) if scale_factor != 1.0 and scale_factor != 0.0: if integer_array: scale_factor = int(scale_factor) array *= scale_factor # Treat BDATUM as an add_offset if it is not 0 - add_offset = real_hdr.item( - 4, - ) + add_offset = real_hdr.item(4) if add_offset != 0.0: if integer_array: add_offset = int(add_offset) @@ -150,7 +135,7 @@ def __str__(self): return "%s%s in %s" % (self.header_offset, self.shape, self.file) @property - def file_pointer(self): + def 
file_address(self): """TODO.""" return (self.file, self.header_offset) @@ -163,7 +148,7 @@ def close(self): `None` - **Examples:** + **Examples** >>> f.close() @@ -177,7 +162,7 @@ def open(self): `um.umread.umfile.File` - **Examples:** + **Examples** >>> f.open() diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py index 2e89ddc271..c9db551ecf 100644 --- a/cf/read_write/um/umread.py +++ b/cf/read_write/um/umread.py @@ -2,10 +2,14 @@ import logging import textwrap from datetime import datetime +from uuid import uuid4 import cfdm import cftime +import dask.array as da from cfdm import Constructs +from dask.array.core import getter, normalize_chunks +from dask.base import tokenize from netCDF4 import date2num as netCDF4_date2num from numpy import arange as numpy_arange from numpy import arccos as numpy_arccos @@ -30,7 +34,7 @@ from ... import __Conventions__, __version__ from ...constants import _stash2standard_name from ...data import UMArray -from ...data.data import Data, Partition, PartitionMatrix +from ...data.data import Data from ...data.functions import _close_um_file, _open_um_file from ...decorators import ( _manage_log_level_via_verbose_attr, @@ -42,6 +46,9 @@ from ...functions import rtol as cf_rtol from ...units import Units +# import numpy as np + + logger = logging.getLogger(__name__) _cached_runid = {} @@ -877,9 +884,7 @@ def __init__( self.z_recs = recs[:nz] self.t_recs = recs[::nz] - LBUSER5 = recs[0].int_hdr.item( - lbuser5, - ) + LBUSER5 = recs[0].int_hdr.item(lbuser5) # self.cell_method_axis_name = {'area': 'area'} @@ -1750,14 +1755,7 @@ def header_lz(self, rec): """ int_hdr = rec.int_hdr - return [ - int_hdr.item( - lblev, - ), - int_hdr.item( - lbuser5, - ), - ] + return [int_hdr.item(lblev), int_hdr.item(lbuser5)] def header_z(self, rec): """Return the list [LBLEV, LBUSER5, BLEV, BRLEV, BHLEV, BHRLEV, @@ -1810,39 +1808,46 @@ def create_data(self): data_axes = [_axis["y"], _axis["x"]] + # Initialise a dask graph for the 
uncompressed array, and some + # dask.array.core.getter arguments + token = tokenize((nt, nz) + yx_shape, uuid4()) + name = (UMArray.__class__.__name__ + "-" + token,) + dsk = {} + full_slice = Ellipsis + if len(recs) == 1: # -------------------------------------------------------- # 0-d partition matrix # -------------------------------------------------------- + # TODODASK, check with DH the below is right (was a missing var) + file_data_types = set() + rec = recs[0] - fill_value = rec.real_hdr.item( - bmdi, - ) + fill_value = rec.real_hdr.item(bmdi) if fill_value == _BMDI_no_missing_data_value: fill_value = None - data = Data( - UMArray( - filename=filename, - ndim=2, - shape=yx_shape, - size=yx_size, - dtype=data_type_in_file(rec), - header_offset=rec.hdr_offset, - data_offset=rec.data_offset, - disk_length=rec.disk_length, - fmt=self.fmt, - word_size=self.word_size, - byte_ordering=self.byte_ordering, - ), - units=units, - fill_value=fill_value, + data_shape = yx_shape + + subarray = UMArray( + filename=filename, + ndim=2, + shape=yx_shape, + size=yx_size, + dtype=data_type_in_file(rec), + header_offset=rec.hdr_offset, + data_offset=rec.data_offset, + disk_length=rec.disk_length, + fmt=self.fmt, + word_size=self.word_size, + byte_ordering=self.byte_ordering, ) - logger.info( - " location = {}".format(yx_shape) - ) # pragma: no cover + dsk[name + (0, 0)] = (getter, subarray, full_slice, False, False) + + dtype = numpy_result_type(*file_data_types) + chunks = normalize_chunks((-1, -1), shape=data_shape, dtype=dtype) else: # -------------------------------------------------------- # 1-d or 2-d partition matrix @@ -1851,82 +1856,65 @@ def create_data(self): # Find the partition matrix shape pmshape = [n for n in (nt, nz) if n > 1] - pmndim = len(pmshape) - - partitions = [] - empty_list = [] - partitions_append = partitions.append - zero_to_LBROW = (0, LBROW) - zero_to_LBNPT = (0, LBNPT) - - if pmndim == 1: + if len(pmshape) == 1: # 
---------------------------------------------------- # 1-d partition matrix # ---------------------------------------------------- - data_ndim = 3 if nz > 1: pmaxes = [_axis[self.z_axis]] data_shape = (nz, LBROW, LBNPT) - data_size = nz * yx_size else: pmaxes = [_axis["t"]] data_shape = (nt, LBROW, LBNPT) - data_size = nt * yx_size - partition_shape = [1, LBROW, LBNPT] + fmt = self.fmt + word_size = self.word_size + byte_ordering = self.byte_ordering for i, rec in enumerate(recs): # Find the data type of the array in the file file_data_type = data_type_in_file(rec) file_data_types.add(file_data_type) + shape = (1,) + yx_shape + subarray = UMArray( filename=filename, - ndim=2, - shape=yx_shape, + ndim=3, + shape=shape, size=yx_size, dtype=file_data_type, header_offset=rec.hdr_offset, data_offset=rec.data_offset, disk_length=rec.disk_length, - fmt=self.fmt, - word_size=self.word_size, - byte_ordering=self.byte_ordering, + fmt=fmt, + word_size=word_size, + byte_ordering=byte_ordering, ) - location = [(i, i + 1), zero_to_LBROW, zero_to_LBNPT] - - partitions_append( - Partition( - subarray=subarray, - location=location, - shape=partition_shape, - axes=data_axes, - flip=empty_list, - part=empty_list, - Units=units, - ) + dsk[name + (i, 0, 0)] = ( + getter, + subarray, + full_slice, + False, + False, ) - logger.info( - " header_offset = {}, location = {}".format( - rec.hdr_offset, location - ) - ) # pragma: no cover - - # Populate the 1-d partition matrix - matrix = numpy_array(partitions, dtype=object) + dtype = numpy_result_type(*file_data_types) + chunks = normalize_chunks( + (1, -1, -1), shape=data_shape, dtype=dtype + ) else: # ---------------------------------------------------- # 2-d partition matrix # ---------------------------------------------------- pmaxes = [_axis["t"], _axis[self.z_axis]] data_shape = (nt, nz, LBROW, LBNPT) - data_size = nt * nz * yx_size - data_ndim = 4 - partition_shape = [1, 1, LBROW, LBNPT] + fmt = self.fmt + word_size = self.word_size + 
byte_ordering = self.byte_ordering for i, rec in enumerate(recs): # Find T and Z axis indices @@ -1936,64 +1924,47 @@ def create_data(self): file_data_type = data_type_in_file(rec) file_data_types.add(file_data_type) + shape = (1, 1) + yx_shape + subarray = UMArray( filename=filename, - ndim=2, - shape=yx_shape, + ndim=4, + shape=shape, size=yx_size, dtype=file_data_type, header_offset=rec.hdr_offset, data_offset=rec.data_offset, disk_length=rec.disk_length, - fmt=self.fmt, - word_size=self.word_size, - byte_ordering=self.byte_ordering, + fmt=fmt, + word_size=word_size, + byte_ordering=byte_ordering, ) - location = [ - (t, t + 1), - (z, z + 1), - zero_to_LBROW, - zero_to_LBNPT, - ] - - partitions_append( - Partition( - subarray=subarray, - location=location, - shape=partition_shape, - axes=data_axes, - flip=empty_list, - part=empty_list, - Units=units, - ) + dsk[name + (t, z, 0, 0)] = ( + getter, + subarray, + full_slice, + False, + False, ) - logger.info( - " location = {}".format(location) - ) # pragma: no cover + dtype = numpy_result_type(*file_data_types) + chunks = normalize_chunks( + (1, 1, -1, -1), shape=data_shape, dtype=dtype + ) - # Populate the 2-d partition matrix - matrix = numpy_array(partitions, dtype=object) - matrix.resize(pmshape) + data_axes = pmaxes + data_axes - data_axes = pmaxes + data_axes + # Set the data array + fill_value = recs[0].real_hdr.item(bmdi) + if fill_value == _BMDI_no_missing_data_value: + fill_value = None - # Set the data array - fill_value = recs[0].real_hdr.item( - bmdi, - ) - if fill_value == _BMDI_no_missing_data_value: - fill_value = None - - data = Data(units=units, fill_value=fill_value) + # Create the dask array + array = da.Array(dsk, name[0], chunks=chunks, dtype=dtype) - data._axes = data_axes - data._shape = data_shape - data._ndim = data_ndim - data._size = data_size - data.partitions = PartitionMatrix(matrix, pmaxes) - data.dtype = numpy_result_type(*file_data_types) + # Create the Data object + data = 
Data(array, units=units, fill_value=fill_value) self.data = data self.data_axes = data_axes @@ -2168,9 +2139,7 @@ def latitude_longitude_2d_aux_coordinates(self, yc, xc): lat, lon = _cached_latlon.get(cache_key, (None, None)) if lat is None: - lat, lon = self.unrotated_latlon( - yc.varray, xc.varray, BPLAT, BPLON - ) + lat, lon = self.unrotated_latlon(yc.array, xc.array, BPLAT, BPLON) atol = self.atol if abs(BDX) >= atol and abs(BDY) >= atol: @@ -2247,14 +2216,7 @@ def model_level_number_coordinate(self, aux=False): out : `AuxiliaryCoordinate` or `DimensionCoordinate` or `None` """ - array = tuple( - [ - rec.int_hdr.item( - lblev, - ) - for rec in self.z_recs - ] - ) + array = tuple([rec.int_hdr.item(lblev) for rec in self.z_recs]) key = array c = _cached_model_level_number_coordinate.get(key, None) @@ -2329,12 +2291,7 @@ def data_type_in_file(self, rec): """ # Find the data type - if ( - rec.int_hdr.item( - lbuser2, - ) - == 3 - ): + if rec.int_hdr.item(lbuser2) == 3: # Boolean return numpy_dtype(bool) else: @@ -2378,12 +2335,7 @@ def pseudolevel_coordinate(self, LBUSER5): else: # 'Z' aggregation has been done along the pseudolevel axis array = numpy_array( - [ - rec.int_hdr.item( - lbuser5, - ) - for rec in self.z_recs - ], + [rec.int_hdr.item(lbuser5) for rec in self.z_recs], dtype=self.int_hdr_dtype, ) self.z_axis = "p" @@ -2622,9 +2574,7 @@ def time_coordinate(self, axiscode): IB = self.lbtim_ib - if IB <= 1 or vtimes.item(0,) >= dtimes.item( - 0, - ): + if IB <= 1 or vtimes.item(0) >= dtimes.item(0): array = vtimes bounds = None climatology = False @@ -3029,14 +2979,7 @@ def z_coordinate(self, axiscode): ) # pragma: no cover z_recs = self.z_recs - array = tuple( - [ - rec.real_hdr.item( - blev, - ) - for rec in z_recs - ] - ) + array = tuple([rec.real_hdr.item(blev) for rec in z_recs]) bounds0 = tuple( [rec.real_hdr[brlev] for rec in z_recs] ) # lower level boundary @@ -3096,13 +3039,7 @@ def z_reference_coordinate(self, axiscode): ) # pragma: no cover array 
= numpy_array( - [ - rec.real_hdr.item( - brlev, - ) - for rec in self.z_recs - ], - dtype=float, + [rec.real_hdr.item(brlev) for rec in self.z_recs], dtype=float ) LBVC = self.lbvc @@ -3117,12 +3054,8 @@ def z_reference_coordinate(self, axiscode): if not 128 <= LBVC <= 139: bounds = [] for rec in self.z_recs: - BRLEV = rec.real_hdr.item( - brlev, - ) - BRSVD1 = rec.real_hdr.item( - brsvd1, - ) + BRLEV = rec.real_hdr.item(brlev) + BRSVD1 = rec.real_hdr.item(brsvd1) if abs(BRSVD1 - BRLEV) >= atol: bounds = None diff --git a/cf/regrid/utils.py b/cf/regrid/utils.py index f5b879d31c..93e4163fa7 100644 --- a/cf/regrid/utils.py +++ b/cf/regrid/utils.py @@ -23,11 +23,7 @@ def regrid_compute_mass_grid( - valuefield, - areafield, - dofrac=False, - fracfield=None, - uninitval=422397696.0, + valuefield, areafield, dofrac=False, fracfield=None, uninitval=422397696.0 ): """Compute the mass of an `ESMF` Field. @@ -575,9 +571,7 @@ def regrid_get_reordered_sections( if axis_order is not None: for axis in axis_order: axis_key = src.dimension_coordinate( - filter_by_axis=(axis,), - default=None, - key=True, + filter_by_axis=(axis,), default=None, key=True ) if axis_key is not None: if axis_key in regrid_axes: @@ -1080,9 +1074,7 @@ def regrid_update_coordinates( dst_data_axes = dst.constructs.data_axes() for aux_key, aux in dst.auxiliary_coordinates( - filter_by_axis=dst_axis_keys, - axis_mode="subset", - todict=True, + filter_by_axis=dst_axis_keys, axis_mode="subset", todict=True ).items(): aux_axes = [ axis_map[key_d] for key_d in dst_data_axes[aux_key] @@ -1124,9 +1116,7 @@ def regrid_update_coordinates( f.set_construct(aux, axes=[src_axis_key]) for aux_key, aux in dst.auxiliary_coordinates( - filter_by_axis=dst_axis_keys, - axis_mode="subset", - todict=True, + filter_by_axis=dst_axis_keys, axis_mode="subset", todict=True ).items(): aux_axes = dst.get_data_axes(aux_key) if aux_axes == tuple(dst_axis_keys): @@ -1218,12 +1208,7 @@ def regrid_initialize(): def create_Regrid( - 
srcfield, - dstfield, - srcfracfield, - dstfracfield, - method, - ignore_degenerate, + srcfield, dstfield, srcfracfield, dstfracfield, method, ignore_degenerate ): """Create an `ESMF` regrid operator. @@ -1491,10 +1476,7 @@ def create_Grid( DimensionCoordinate( data=Data(0), bounds=Data( - [ - np.finfo("float32").epsneg, - np.finfo("float32").eps, - ] + [np.finfo("float32").epsneg, np.finfo("float32").eps] ), ) ] + coords @@ -1510,10 +1492,7 @@ def create_Grid( max_index = np.array(shape, dtype="int32") if use_bounds: if ndim < 3: - staggerLocs = [ - ESMF.StaggerLoc.CORNER, - ESMF.StaggerLoc.CENTER, - ] + staggerLocs = [ESMF.StaggerLoc.CORNER, ESMF.StaggerLoc.CENTER] else: staggerLocs = [ ESMF.StaggerLoc.CENTER_VCENTER, diff --git a/cf/test/create_test_files.py b/cf/test/create_test_files.py index 9ac0f43742..bbbacd86ae 100644 --- a/cf/test/create_test_files.py +++ b/cf/test/create_test_files.py @@ -798,14 +798,7 @@ def _jj(shape, list_values): aux7 = n.createVariable("aux7", "f8", ("lat",)) aux7[...] = numpy.arange(lat.size) - aux8 = n.createVariable( - "aux8", - "f8", - ( - "lon", - "lat", - ), - ) + aux8 = n.createVariable("aux8", "f8", ("lon", "lat")) aux8[...] 
= numpy.arange(lon.size * lat.size).reshape(lon.size, lat.size) aux9 = n.createVariable("aux9", "f8", ("time", "height")) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index a873c123a2..70d032befd 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -7,7 +7,7 @@ from functools import reduce from operator import mul -import numpy +import numpy as np SCIPY_AVAILABLE = False try: @@ -22,49 +22,40 @@ import cf - -def reshape_array(a, axes): - new_order = [i for i in range(a.ndim) if i not in axes] - new_order.extend(axes) - b = numpy.transpose(a, new_order) - new_shape = b.shape[: b.ndim - len(axes)] - new_shape += (reduce(mul, b.shape[b.ndim - len(axes) :]),) - b = b.reshape(new_shape) - return b - - # Variables for _collapse -a = numpy.arange(-100, 200.0, dtype=float).reshape(3, 4, 5, 5) - -w = numpy.arange(1, 301.0, dtype=float).reshape(a.shape) +a = np.arange(-100, 200.0, dtype=float).reshape(3, 4, 5, 5) +w = np.arange(1, 301.0, dtype=float).reshape(a.shape) w[-1, -1, ...] = w[-1, -1, ...] * 2 w /= w.min() -ones = numpy.ones(a.shape, dtype=float) - -ma = numpy.ma.arange(-100, 200.0, dtype=float).reshape(3, 4, 5, 5) -ma[:, 1, 4, 4] = numpy.ma.masked -ma[0, :, 2, 3] = numpy.ma.masked -ma[0, 3, :, 3] = numpy.ma.masked -ma[1, 2, 3, :] = numpy.ma.masked +ones = np.ones(a.shape, dtype=float) +# TODODASK: these can be moved into the lone tests that use them now +ma = np.ma.arange(-100, 200.0, dtype=float).reshape(3, 4, 5, 5) +ma[:, 1, 4, 4] = np.ma.masked +ma[0, :, 2, 3] = np.ma.masked +ma[0, 3, :, 3] = np.ma.masked +ma[1, 2, 3, :] = np.ma.masked -mw = numpy.ma.array(w, mask=ma.mask) -mones = numpy.ma.array(ones, mask=ma.mask) +# If True, all tests that will not pass temporarily due to the LAMA-to-Dask +# migration will be skipped. These skips will be incrementally removed as the +# migration progresses. TODODASK: ensure all skips are removed once complete. 
+TEST_DASKIFIED_ONLY = True -class DataTest(unittest.TestCase): +def reshape_array(a, axes): + new_order = [i for i in range(a.ndim) if i not in axes] + new_order.extend(axes) + b = np.transpose(a, new_order) + new_shape = b.shape[: b.ndim - len(axes)] + new_shape += (reduce(mul, b.shape[b.ndim - len(axes) :]),) + b = b.reshape(new_shape) + return b - chunk_sizes = (100000, 300, 34) # 17 - original_chunksize = cf.chunksize() - axes_permutations = [ - axes - for n in range(1, a.ndim + 1) - for axes in itertools.permutations(range(a.ndim), n) - ] +class DataTest(unittest.TestCase): axes_combinations = [ axes @@ -82,17 +73,17 @@ class DataTest(unittest.TestCase): os.path.dirname(os.path.abspath(__file__)), "test_file2.nc" ) + # TODODASK: these can be moved into the lone tests that use them now a = a w = w ma = ma - mw = mw ones = ones - mones = mones test_only = [] - # test_only = ["NOTHING!!!!!"] - # test_only = [ - # 'test_Data_percentile', + # test_only = ['NOTHING!!!!!'] + # test_only = [ + # "test_Data___setitem__", + # ] # 'test_Data_trigonometric_hyperbolic' # 'test_Data_AUXILIARY_MASK', # 'test_Data_datum', @@ -140,11 +131,12 @@ class DataTest(unittest.TestCase): # test_only = ['test_Data_clip'] # test_only = ['test_Data__init__dtype_mask'] + @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits unexpected kwarg 'ndim'") def test_Data_halo(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - d = cf.Data(numpy.arange(12).reshape(3, 4), "m") + d = cf.Data(np.arange(12).reshape(3, 4), "m") d[-1, -1] = cf.masked d[1, 1] = cf.masked @@ -222,62 +214,59 @@ def test_Data_halo(self): # [ 8 8 9 10 -- --] # [ 8 8 9 10 -- --]] + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_apply_masking(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return a = self.ma - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(a, units="m") + d = cf.Data(a, units="m") - self.assertTrue((a == d.array).all()) - self.assertTrue((a.mask == d.mask.array).all()) + self.assertTrue((a == d.array).all()) + self.assertTrue((a.mask == d.mask.array).all()) - b = a.copy() - e = d.apply_masking() - self.assertTrue((b == e.array).all()) - self.assertTrue((b.mask == e.mask.array).all()) + b = a.copy() + e = d.apply_masking() + self.assertTrue((b == e.array).all()) + self.assertTrue((b.mask == e.mask.array).all()) - b = numpy.ma.where(a == 0, numpy.ma.masked, a) - e = d.apply_masking(fill_values=[0]) - self.assertTrue((b == e.array).all()) - self.assertTrue((b.mask == e.mask.array).all()) + b = np.ma.where(a == 0, np.ma.masked, a) + e = d.apply_masking(fill_values=[0]) + self.assertTrue((b == e.array).all()) + self.assertTrue((b.mask == e.mask.array).all()) - b = numpy.ma.where((a == 0) | (a == 11), numpy.ma.masked, a) - e = d.apply_masking(fill_values=[0, 11]) - self.assertTrue((b == e.array).all()) - self.assertTrue((b.mask == e.mask.array).all()) + b = np.ma.where((a == 0) | (a == 11), np.ma.masked, a) + e = d.apply_masking(fill_values=[0, 11]) + self.assertTrue((b == e.array).all()) + self.assertTrue((b.mask == e.mask.array).all()) - b = numpy.ma.where(a < 30, numpy.ma.masked, a) - e = d.apply_masking(valid_min=30) - self.assertTrue((b == e.array).all()) - self.assertTrue((b.mask == e.mask.array).all()) + b = np.ma.where(a < 30, np.ma.masked, a) + e = d.apply_masking(valid_min=30) + self.assertTrue((b == e.array).all()) + self.assertTrue((b.mask == e.mask.array).all()) - b = numpy.ma.where(a > -60, numpy.ma.masked, a) - e = d.apply_masking(valid_max=-60) - self.assertTrue((b == e.array).all()) - self.assertTrue((b.mask == e.mask.array).all()) + b = np.ma.where(a > -60, 
np.ma.masked, a) + e = d.apply_masking(valid_max=-60) + self.assertTrue((b == e.array).all()) + self.assertTrue((b.mask == e.mask.array).all()) - b = numpy.ma.where((a < -20) | (a > 80), numpy.ma.masked, a) - e = d.apply_masking(valid_range=[-20, 80]) - self.assertTrue((b == e.array).all()) - self.assertTrue((b.mask == e.mask.array).all()) + b = np.ma.where((a < -20) | (a > 80), np.ma.masked, a) + e = d.apply_masking(valid_range=[-20, 80]) + self.assertTrue((b == e.array).all()) + self.assertTrue((b.mask == e.mask.array).all()) - d.set_fill_value(70) + d.set_fill_value(70) - b = numpy.ma.where(a == 70, numpy.ma.masked, a) - e = d.apply_masking(fill_values=True) - self.assertTrue((b == e.array).all()) - self.assertTrue((b.mask == e.mask.array).all()) + b = np.ma.where(a == 70, np.ma.masked, a) + e = d.apply_masking(fill_values=True) + self.assertTrue((b == e.array).all()) + self.assertTrue((b.mask == e.mask.array).all()) - b = numpy.ma.where( - (a == 70) | (a < 20) | (a > 80), numpy.ma.masked, a - ) - e = d.apply_masking(fill_values=True, valid_range=[20, 80]) - self.assertTrue((b == e.array).all()) - self.assertTrue((b.mask == e.mask.array).all()) + b = np.ma.where((a == 70) | (a < 20) | (a > 80), np.ma.masked, a) + e = d.apply_masking(fill_values=True, valid_range=[20, 80]) + self.assertTrue((b == e.array).all()) + self.assertTrue((b.mask == e.mask.array).all()) def test_Data_convolution_filter(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: @@ -296,31 +285,23 @@ def test_Data_convolution_filter(self): d = cf.Data(self.ma, units="m") - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - # Test user weights in different modes - for mode in ( - "reflect", - "constant", - "nearest", - "mirror", - "wrap", - ): - b = convolve1d(d.array, window, axis=-1, mode=mode) - e = d.convolution_filter( - window=window, axis=-1, mode=mode, cval=0.0 - ) - self.assertTrue((e.array == b).all()) - # --- End: for + # Test user weights in 
different modes + for mode in ("reflect", "constant", "nearest", "mirror", "wrap"): + b = convolve1d(d.array, window, axis=-1, mode=mode) + e = d.convolution_filter( + window=window, axis=-1, mode=mode, cval=0.0 + ) + self.assertTrue((e.array == b).all()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_diff(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - a = numpy.ma.arange(12.0).reshape(3, 4) + a = np.ma.arange(12.0).reshape(3, 4) a[1, 1] = 4.5 a[2, 2] = 10.5 - a[1, 2] = numpy.ma.masked + a[1, 2] = np.ma.masked d = cf.Data(a) @@ -333,7 +314,7 @@ def test_Data_diff(self): for n in (0, 1, 2): for axis in (0, 1, -1, -2): - a_diff = numpy.diff(a, n=n, axis=axis) + a_diff = np.diff(a, n=n, axis=axis) d_diff = d.diff(n=n, axis=axis) self.assertTrue((a_diff == d_diff).all()) @@ -345,22 +326,20 @@ def test_Data_diff(self): self.assertTrue(e.equals(d_diff)) # --- End: for - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(self.ma, "km") - for n in (0, 1, 2): - for axis in (0, 1, 2, 3): - a_diff = numpy.diff(self.ma, n=n, axis=axis) - d_diff = d.diff(n=n, axis=axis) - self.assertTrue((a_diff == d_diff).all()) - self.assertTrue((a_diff.mask == d_diff.mask).all()) - # --- End: for + d = cf.Data(self.ma, "km") + for n in (0, 1, 2): + for axis in (0, 1, 2, 3): + a_diff = np.diff(self.ma, n=n, axis=axis) + d_diff = d.diff(n=n, axis=axis) + self.assertTrue((a_diff == d_diff).all()) + self.assertTrue((a_diff.mask == d_diff.mask).all()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_compressed(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - a = numpy.ma.arange(12).reshape(3, 4) + a = np.ma.arange(12).reshape(3, 4) d = cf.Data(a) self.assertTrue((d.array == a).all()) @@ -371,8 +350,8 @@ def test_Data_compressed(self): self.assertIsNone(x) self.assertTrue(e.equals(d.compressed())) - a[1, 1] = numpy.ma.masked - 
a[2, 3] = numpy.ma.masked + a[1, 1] = np.ma.masked + a[2, 3] = np.ma.masked d = cf.Data(a) self.assertTrue((d.array == a).all()) @@ -384,16 +363,13 @@ def test_Data_compressed(self): self.assertIsNone(x) self.assertTrue(e.equals(d.compressed())) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(self.a, "km") - self.assertTrue((self.a.flatten() == d.compressed()).all()) + d = cf.Data(self.a, "km") + self.assertTrue((self.a.flatten() == d.compressed()).all()) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(self.ma, "km") - self.assertTrue((self.ma.compressed() == d.compressed()).all()) + d = cf.Data(self.ma, "km") + self.assertTrue((self.ma.compressed() == d.compressed()).all()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_shape'") def test_Data_stats(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -404,6 +380,7 @@ def test_Data_stats(self): _ = d.stats(all=True) _ = d.stats(mean_of_upper_decile=True, range=False) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_shape'") def test_Data__init__dtype_mask(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -430,14 +407,14 @@ def test_Data__init__dtype_mask(self): self.assertEqual(d.count(), 3) self.assertEqual(d.shape, (2, 3)) - a = numpy.ma.array( + a = np.ma.array( [[280.0, -99, -99, -99], [281.0, 279.0, 278.0, 279.0]], dtype=float, mask=[[0, 1, 1, 1], [0, 0, 0, 0]], ) d = cf.Data([[280, -99, -99, -99], [281, 279, 278, 279]]) - self.assertEqual(d.dtype, numpy.dtype(int)) + self.assertEqual(d.dtype, np.dtype(int)) d = cf.Data( [[280, -99, -99, -99], [281, 279, 278, 279]], @@ -448,22 +425,22 @@ def test_Data__init__dtype_mask(self): self.assertEqual(d.dtype, a.dtype) self.assertEqual(d.mask.shape, a.mask.shape) self.assertTrue((d.array == a).all()) - self.assertTrue((d.mask.array == numpy.ma.getmaskarray(a)).all()) + self.assertTrue((d.mask.array == 
np.ma.getmaskarray(a)).all()) - a = numpy.array( + a = np.array( [[280.0, -99, -99, -99], [281.0, 279.0, 278.0, 279.0]], dtype=float ) - mask = numpy.ma.masked_all(a.shape).mask + mask = np.ma.masked_all(a.shape).mask d = cf.Data([[280, -99, -99, -99], [281, 279, 278, 279]], dtype=float) self.assertEqual(d.dtype, a.dtype) self.assertEqual(d.mask.shape, mask.shape) self.assertTrue((d.array == a).all()) - self.assertTrue((d.mask.array == numpy.ma.getmaskarray(a)).all()) + self.assertTrue((d.mask.array == np.ma.getmaskarray(a)).all()) # Mask broadcasting - a = numpy.ma.array( + a = np.ma.array( [[280.0, -99, -99, -99], [281.0, 279.0, 278.0, 279.0]], dtype=float, mask=[[0, 1, 1, 0], [0, 1, 1, 0]], @@ -478,46 +455,44 @@ def test_Data__init__dtype_mask(self): self.assertEqual(d.dtype, a.dtype) self.assertEqual(d.mask.shape, a.mask.shape) self.assertTrue((d.array == a).all()) - self.assertTrue((d.mask.array == numpy.ma.getmaskarray(a)).all()) + self.assertTrue((d.mask.array == np.ma.getmaskarray(a)).all()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_digitize(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return for a in [ - numpy.arange(120).reshape(3, 2, 20), - numpy.ma.arange(120).reshape(3, 2, 20), + np.arange(120).reshape(3, 2, 20), + np.ma.arange(120).reshape(3, 2, 20), ]: - if numpy.ma.isMA(a): - a[0, 1, [2, 5, 6, 7, 8]] = numpy.ma.masked - a[2, 0, [12, 14, 17]] = numpy.ma.masked - - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(a, "km") - - for upper in (False, True): - for bins in ( - [2, 6, 10, 50, 100], - [[2, 6], [6, 10], [10, 50], [50, 100]], - ): - e = d.digitize(bins, upper=upper, open_ends=True) - b = numpy.digitize( - a, [2, 6, 10, 50, 100], right=upper - ) - - self.assertTrue((e.array == b).all()) - - e.where( - cf.set([e.minimum(), e.maximum()]), - cf.masked, - e - 1, - inplace=True, - ) - f = d.digitize(bins, upper=upper) - self.assertTrue(e.equals(f, verbose=2)) + if np.ma.isMA(a): + a[0, 1, [2, 5, 6, 7, 8]] = np.ma.masked + a[2, 0, [12, 14, 17]] = np.ma.masked + + d = cf.Data(a, "km") + + for upper in (False, True): + for bins in ( + [2, 6, 10, 50, 100], + [[2, 6], [6, 10], [10, 50], [50, 100]], + ): + e = d.digitize(bins, upper=upper, open_ends=True) + b = np.digitize(a, [2, 6, 10, 50, 100], right=upper) + + self.assertTrue((e.array == b).all()) + + e.where( + cf.set([e.minimum(), e.maximum()]), + cf.masked, + e - 1, + inplace=True, + ) + f = d.digitize(bins, upper=upper) + self.assertTrue(e.equals(f, verbose=2)) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_cumsum(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -528,22 +503,21 @@ def test_Data_cumsum(self): self.assertIsNone(e.cumsum(axis=0, inplace=True)) self.assertTrue(e.equals(f, verbose=2)) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(self.a) + d = cf.Data(self.a) - for i in range(d.ndim): - b = numpy.cumsum(self.a, axis=i) - 
e = d.cumsum(axis=i) - self.assertTrue((e.array == b).all()) + for i in range(d.ndim): + b = np.cumsum(self.a, axis=i) + e = d.cumsum(axis=i) + self.assertTrue((e.array == b).all()) - d = cf.Data(self.ma) + d = cf.Data(self.ma) - for i in range(d.ndim): - b = numpy.cumsum(self.ma, axis=i) - e = d.cumsum(axis=i, masked_as_zero=False) - self.assertTrue(cf.functions._numpy_allclose(e.array, b)) + for i in range(d.ndim): + b = np.cumsum(self.ma, axis=i) + e = d.cumsum(axis=i, masked_as_zero=False) + self.assertTrue(cf.functions._numpy_allclose(e.array, b)) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_flatten(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -552,37 +526,32 @@ def test_Data_flatten(self): self.assertTrue(d.equals(d.flatten([]), verbose=2)) self.assertIsNone(d.flatten(inplace=True)) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(self.ma.copy()) - - b = self.ma.flatten() - for axes in (None, list(range(d.ndim))): - e = d.flatten(axes) - self.assertEqual(e.ndim, 1) - self.assertEqual(e.shape, b.shape) - self.assertTrue(cf.functions._numpy_allclose(e.array, b)) - - for axes in self.axes_combinations: - e = d.flatten(axes) + d = cf.Data(self.ma.copy()) - if len(axes) <= 1: - shape = d.shape - else: - shape = [ - n for i, n in enumerate(d.shape) if i not in axes - ] - shape.insert( - sorted(axes)[0], - numpy.prod( - [n for i, n in enumerate(d.shape) if i in axes] - ), - ) + b = self.ma.flatten() + for axes in (None, list(range(d.ndim))): + e = d.flatten(axes) + self.assertEqual(e.ndim, 1) + self.assertEqual(e.shape, b.shape) + self.assertTrue(cf.functions._numpy_allclose(e.array, b)) + + for axes in self.axes_combinations: + e = d.flatten(axes) + + if len(axes) <= 1: + shape = d.shape + else: + shape = [n for i, n in enumerate(d.shape) if i not in axes] + shape.insert( + sorted(axes)[0], + np.prod([n for i, n in enumerate(d.shape) if i in axes]), + ) - 
self.assertEqual(e.shape, tuple(shape)) - self.assertEqual(e.ndim, d.ndim - len(axes) + 1) - self.assertEqual(e.size, d.size) + self.assertEqual(e.shape, tuple(shape)) + self.assertEqual(e.ndim, d.ndim - len(axes) + 1) + self.assertEqual(e.size, d.size) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute 'partitions'") def test_Data_CachedArray(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -592,29 +561,27 @@ def test_Data_CachedArray(self): cf.tempdir(self.tempdir) original_FMF = cf.free_memory_factor(1 - factor) - d = cf.Data(numpy.arange(100)) + d = cf.Data(np.arange(100)) cf.free_memory_factor(factor) _ = d.array for partition in d.partitions.flat: self.assertTrue(partition.in_cached_file) - _ = numpy.arange(1000000).reshape(100, 10000) + _ = np.arange(1000000).reshape(100, 10000) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - cf.free_memory_factor(1 - factor) - d = cf.Data(numpy.arange(10000).reshape(100, 100)) - cf.free_memory_factor(factor) + cf.free_memory_factor(1 - factor) + d = cf.Data(np.arange(10000).reshape(100, 100)) + cf.free_memory_factor(factor) - _ = d.array + _ = d.array - for partition in d.partitions.flat: - self.assertTrue(partition.in_cached_file) - # --- End: for + for partition in d.partitions.flat: + self.assertTrue(partition.in_cached_file) cf.free_memory_factor(original_FMF) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_cached_arithmetic_units(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -650,6 +617,7 @@ def test_Data_cached_arithmetic_units(self): # Reset cf.constants.CONSTANTS["FM_THRESHOLD"] = fmt + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_auxiliary_mask'") def test_Data_AUXILIARY_MASK(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -659,7 +627,7 @@ def test_Data_AUXILIARY_MASK(self): self.assertIsNone(d._auxiliary_mask_return()) d = cf.Data.empty((90, 60)) - m = numpy.full(d.shape, fill_value=False, dtype=bool) + m = np.full(d.shape, fill_value=False, dtype=bool) self.assertIsNone(d._auxiliary_mask) self.assertEqual(d._auxiliary_mask_return().shape, m.shape) @@ -674,7 +642,7 @@ def test_Data_AUXILIARY_MASK(self): self.assertTrue((d._auxiliary_mask_return() == m).all()) d = cf.Data.empty((90, 60)) - m = numpy.full(d.shape, fill_value=False, dtype=bool) + m = np.full(d.shape, fill_value=False, dtype=bool) d = cf.Data.empty((90, 60)) d._auxiliary_mask_add_component(cf.Data(m[0:1, :])) @@ -693,319 +661,415 @@ def test_Data_AUXILIARY_MASK(self): self.assertEqual(d._auxiliary_mask_return().shape, m.shape) self.assertTrue((d._auxiliary_mask_return() == m).all()) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - # -------------------------------------------------------- - d = cf.Data(numpy.arange(120).reshape(30, 4)) - e = cf.Data(numpy.arange(120, 280).reshape(40, 4)) + # -------------------------------------------------------- + d = cf.Data(np.arange(120).reshape(30, 4)) + e = cf.Data(np.arange(120, 280).reshape(40, 4)) - fm = cf.Data.full((70, 4), fill_value=False, dtype=bool) + fm = cf.Data.full((70, 4), fill_value=False, dtype=bool) - fm[0, 0] = True - fm[10, 2] = True - fm[20, 1] = True + fm[0, 0] = True + fm[10, 2] = True + fm[20, 1] = True - dm = fm[:30] - d._auxiliary_mask = [dm] + dm = fm[:30] + d._auxiliary_mask = [dm] - f = 
cf.Data.concatenate([d, e], axis=0) - self.assertEqual(f.shape, fm.shape) - self.assertTrue((f._auxiliary_mask_return().array == fm).all()) + f = cf.Data.concatenate([d, e], axis=0) + self.assertEqual(f.shape, fm.shape) + self.assertTrue((f._auxiliary_mask_return().array == fm).all()) - # -------------------------------------------------------- - d = cf.Data(numpy.arange(120).reshape(30, 4)) - e = cf.Data(numpy.arange(120, 280).reshape(40, 4)) + # -------------------------------------------------------- + d = cf.Data(np.arange(120).reshape(30, 4)) + e = cf.Data(np.arange(120, 280).reshape(40, 4)) - fm = cf.Data.full((70, 4), False, bool) - fm[50, 0] = True - fm[60, 2] = True - fm[65, 1] = True + fm = cf.Data.full((70, 4), False, bool) + fm[50, 0] = True + fm[60, 2] = True + fm[65, 1] = True - em = fm[30:] - e._auxiliary_mask = [em] + em = fm[30:] + e._auxiliary_mask = [em] - f = cf.Data.concatenate([d, e], axis=0) - self.assertEqual(f.shape, fm.shape) - self.assertTrue((f._auxiliary_mask_return().array == fm).all()) + f = cf.Data.concatenate([d, e], axis=0) + self.assertEqual(f.shape, fm.shape) + self.assertTrue((f._auxiliary_mask_return().array == fm).all()) - # -------------------------------------------------------- - d = cf.Data(numpy.arange(120).reshape(30, 4)) - e = cf.Data(numpy.arange(120, 280).reshape(40, 4)) + # -------------------------------------------------------- + d = cf.Data(np.arange(120).reshape(30, 4)) + e = cf.Data(np.arange(120, 280).reshape(40, 4)) - fm = cf.Data.full((70, 4), False, bool) - fm[0, 0] = True - fm[10, 2] = True - fm[20, 1] = True - fm[50, 0] = True - fm[60, 2] = True - fm[65, 1] = True + fm = cf.Data.full((70, 4), False, bool) + fm[0, 0] = True + fm[10, 2] = True + fm[20, 1] = True + fm[50, 0] = True + fm[60, 2] = True + fm[65, 1] = True - dm = fm[:30] - d._auxiliary_mask = [dm] - em = fm[30:] - e._auxiliary_mask = [em] + dm = fm[:30] + d._auxiliary_mask = [dm] + em = fm[30:] + e._auxiliary_mask = [em] - f = 
cf.Data.concatenate([d, e], axis=0) - self.assertEqual(f.shape, fm.shape) - self.assertTrue((f._auxiliary_mask_return().array == fm).all()) + f = cf.Data.concatenate([d, e], axis=0) + self.assertEqual(f.shape, fm.shape) + self.assertTrue((f._auxiliary_mask_return().array == fm).all()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "TypeError: 'int' is not iterable") def test_Data___contains__(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data([[0.0, 1, 2], [3, 4, 5]], units="m") - self.assertIn(4, d) - self.assertNotIn(40, d) - self.assertIn(cf.Data(3), d) - self.assertIn(cf.Data([[[[3]]]]), d) - value = d[1, 2] - value.Units *= 2 - value.squeeze(0) - self.assertIn(value, d) - self.assertIn(numpy.array([[[2]]]), d) + d = cf.Data([[0.0, 1, 2], [3, 4, 5]], units="m") + self.assertIn(4, d) + self.assertNotIn(40, d) + self.assertIn(cf.Data(3), d) + self.assertIn(cf.Data([[[[3]]]]), d) + value = d[1, 2] + value.Units *= 2 + value.squeeze(0) + self.assertIn(value, d) + self.assertIn(np.array([[[2]]]), d) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_asdata(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(self.ma) + d = cf.Data(self.ma) - self.assertIs(d.asdata(d), d) - self.assertIs(cf.Data.asdata(d), d) - self.assertIs(d.asdata(d, dtype=d.dtype), d) - self.assertIs(cf.Data.asdata(d, dtype=d.dtype), d) + self.assertIs(d.asdata(d), d) + self.assertIs(cf.Data.asdata(d), d) + self.assertIs(d.asdata(d, dtype=d.dtype), d) + self.assertIs(cf.Data.asdata(d, dtype=d.dtype), d) - self.assertIsNot(d.asdata(d, dtype="float32"), d) - self.assertIsNot(cf.Data.asdata(d, dtype="float32"), d) - self.assertIsNot(d.asdata(d, dtype=d.dtype, copy=True), d) - self.assertIsNot( - cf.Data.asdata(d, dtype=d.dtype, copy=True), d - ) + self.assertIsNot(d.asdata(d, dtype="float32"), d) + self.assertIsNot(cf.Data.asdata(d, dtype="float32"), d) + self.assertIsNot(d.asdata(d, dtype=d.dtype, copy=True), d) + self.assertIsNot(cf.Data.asdata(d, dtype=d.dtype, copy=True), d) - self.assertTrue( - cf.Data.asdata( - cf.Data([1, 2, 3]), dtype=float, copy=True - ).equals(cf.Data([1.0, 2, 3]), verbose=2) - ) + self.assertTrue( + cf.Data.asdata(cf.Data([1, 2, 3]), dtype=float, copy=True).equals( + cf.Data([1.0, 2, 3]), verbose=2 + ) + ) - self.assertTrue( - cf.Data.asdata([1, 2, 3]).equals( - cf.Data([1, 2, 3]), verbose=2 - ) - ) - self.assertTrue( - cf.Data.asdata([1, 2, 3], dtype=float).equals( - cf.Data([1.0, 2, 3]), verbose=2 - ) - ) + self.assertTrue( + cf.Data.asdata([1, 2, 3]).equals(cf.Data([1, 2, 3]), verbose=2) + ) + self.assertTrue( + cf.Data.asdata([1, 2, 3], dtype=float).equals( + cf.Data([1.0, 2, 3]), verbose=2 + ) + ) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_squeeze_insert_dimension(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = 
cf.Data([list(range(1000))]) - self.assertEqual(d.shape, (1, 1000)) - e = d.squeeze() - self.assertEqual(e.shape, (1000,)) - self.assertIsNone(d.squeeze(inplace=True)) - self.assertEqual(d.shape, (1000,)) - - d = cf.Data([list(range(1000))]) - d.transpose(inplace=True) - self.assertEqual(d.shape, (1000, 1)) - e = d.squeeze() - self.assertEqual(e.shape, (1000,)) - self.assertIsNone(d.squeeze(inplace=True)) - self.assertEqual(d.shape, (1000,)) - - d.insert_dimension(0, inplace=True) - d.insert_dimension(-1, inplace=True) - self.assertEqual(d.shape, (1, 1000, 1)) - e = d.squeeze() - self.assertEqual(e.shape, (1000,)) - e = d.squeeze(-1) - self.assertEqual( - e.shape, - ( - 1, - 1000, - ), - ) - self.assertIsNone(e.squeeze(0, inplace=True)) - self.assertEqual(e.shape, (1000,)) + d = cf.Data([list(range(1000))]) + self.assertEqual(d.shape, (1, 1000)) + e = d.squeeze() + self.assertEqual(e.shape, (1000,)) + self.assertIsNone(d.squeeze(inplace=True)) + self.assertEqual(d.shape, (1000,)) + + d = cf.Data([list(range(1000))]) + d.transpose(inplace=True) + self.assertEqual(d.shape, (1000, 1)) + e = d.squeeze() + self.assertEqual(e.shape, (1000,)) + self.assertIsNone(d.squeeze(inplace=True)) + self.assertEqual(d.shape, (1000,)) + + d.insert_dimension(0, inplace=True) + d.insert_dimension(-1, inplace=True) + self.assertEqual(d.shape, (1, 1000, 1)) + e = d.squeeze() + self.assertEqual(e.shape, (1000,)) + e = d.squeeze(-1) + self.assertEqual(e.shape, (1, 1000)) + self.assertIsNone(e.squeeze(0, inplace=True)) + self.assertEqual(e.shape, (1000,)) + + d = e + d.insert_dimension(0, inplace=True) + d.insert_dimension(-1, inplace=True) + d.insert_dimension(-1, inplace=True) + self.assertEqual(d.shape, (1, 1000, 1, 1)) + e = d.squeeze([0, 2]) + self.assertEqual(e.shape, (1000, 1)) + + array = np.arange(1000).reshape(1, 100, 10) + d = cf.Data(array) + e = d.squeeze() + f = e.insert_dimension(0) + a = f.array + self.assertTrue(np.allclose(a, array)) + + def test_Data__getitem__(self): + 
if self.test_only and inspect.stack()[0][3] not in self.test_only: + return + + d = cf.Data(np.ma.arange(450).reshape(9, 10, 5), chunks=(4, 5, 1)) + + for indices in ( + Ellipsis, + (slice(None), slice(None)), + (slice(None), Ellipsis), + (Ellipsis, slice(None)), + (Ellipsis, slice(None), Ellipsis), + ): + self.assertEqual(d[indices].shape, d.shape) + + for indices in ( + ([1, 3, 4], slice(None), [2, -1]), + (slice(0, 6, 2), slice(None), [2, -1]), + (slice(0, 6, 2), slice(None), slice(2, 5, 2)), + (slice(0, 6, 2), list(range(10)), slice(2, 5, 2)), + ): + self.assertEqual(d[indices].shape, (3, 10, 2)) - d = e - d.insert_dimension(0, inplace=True) - d.insert_dimension(-1, inplace=True) - d.insert_dimension(-1, inplace=True) - self.assertEqual(d.shape, (1, 1000, 1, 1)) - e = d.squeeze([0, 2]) - self.assertEqual(e.shape, (1000, 1)) + for indices in ( + (slice(0, 6, 2), -2, [2, -1]), + (slice(0, 6, 2), -2, slice(2, 5, 2)), + ): + self.assertEqual(d[indices].shape, (3, 1, 2)) - array = numpy.arange(1000).reshape(1, 100, 10) - d = cf.Data(array) - e = d.squeeze() - f = e.insert_dimension(0) - a = f.array - self.assertTrue(numpy.allclose(a, array)) + for indices in ( + ([1, 3, 4], -2, [2, -1]), + ([4, 3, 1], -2, [2, -1]), + ([1, 4, 3], -2, [2, -1]), + ([4, 1, 4], -2, [2, -1]), + ): + e = d[indices] + self.assertEqual(e.shape, (3, 1, 2)) + self.assertEqual(e._axes, d._axes) + + d.__keepdims_indexing__ = False + self.assertFalse(d.__keepdims_indexing__) + for indices in ( + ([1, 3, 4], -2, [2, -1]), + (slice(0, 6, 2), -2, [2, -1]), + (slice(0, 6, 2), -2, slice(2, 5, 2)), + ([1, 4, 3], -2, [2, -1]), + ([4, 3, 4], -2, [2, -1]), + ([1, 4, 4], -2, [2, -1]), + ): + e = d[indices] + self.assertFalse(e.__keepdims_indexing__) + self.assertEqual(e.shape, (3, 2)) + self.assertEqual(e._axes, d._axes[0::2]) + + self.assertFalse(d.__keepdims_indexing__) + d.__keepdims_indexing__ = True + self.assertTrue(d.__keepdims_indexing__) + + d = cf.Data(np.ma.arange(24).reshape(3, 8)) + e = d[0, 
2:4] + + # Cyclic slices + d = cf.Data(np.ma.arange(24).reshape(3, 8)) + d.cyclic(1) + self.assertTrue((d[0, :6].array == [[0, 1, 2, 3, 4, 5]]).all()) + e = d[0, -2:4] + self.assertEqual(e._axes, d._axes) + self.assertEqual(e.shape, (1, 6)) + self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3]]).all()) + self.assertFalse(e.cyclic()) + + d.__keepdims_indexing__ = False + e = d[:, 4] + self.assertEqual(e.shape, (3,)) + self.assertFalse(e.cyclic()) + self.assertEqual(e._axes, d._axes[0:1]) + d.__keepdims_indexing__ = True + + e = d[0, -2:6] + self.assertEqual(e.shape, (1, 8)) + self.assertTrue((e[0].array == [[6, 7, 0, 1, 2, 3, 4, 5]]).all()) + self.assertTrue(e.cyclic(), set([1])) + + with self.assertRaises(IndexError): + # Cyclic slice of non-cyclic axis + e = d[-1:1] + + d.cyclic(0) + e = d[-1:1, -2:-4] + self.assertEqual(e.shape, (2, 6)) + self.assertTrue((e[:, 0].array == [[22], [6]]).all()) + self.assertTrue((e[0].array == [[22, 23, 16, 17, 18, 19]]).all()) + self.assertFalse(e.cyclic()) + + e = d[-1:2, -2:4] + self.assertEqual(e.shape, (3, 6)) + self.assertEqual(e.cyclic(), set([0])) + e = d[-1:1, -2:6] + self.assertEqual(e.shape, (2, 8)) + self.assertEqual(e.cyclic(), set([1])) + e = d[-1:2, -2:6] + self.assertEqual(e.shape, (3, 8)) + self.assertEqual(e.cyclic(), set([0, 1])) + + d.cyclic(0, False) + d.__keepdims_indexing__ = False + e = d[0, :6] + self.assertFalse(e.__keepdims_indexing__) + self.assertEqual(e.shape, (6,)) + self.assertTrue((e.array == [0, 1, 2, 3, 4, 5]).all()) + e = d[0, -2:4] + self.assertEqual(e.shape, (6,)) + self.assertTrue((e.array == [6, 7, 0, 1, 2, 3]).all()) + self.assertFalse(e.cyclic()) + d.__keepdims_indexing__ = True + + # Keepdims indexing + d = cf.Data([[1, 2, 3], [4, 5, 6]]) + self.assertEqual(d[0].shape, (1, 3)) + self.assertEqual(d[:, 1].shape, (2, 1)) + self.assertEqual(d[0, 1].shape, (1, 1)) + d.__keepdims_indexing__ = False + self.assertEqual(d[0].shape, (3,)) + self.assertEqual(d[:, 1].shape, (2,)) + self.assertEqual(d[0, 
1].shape, ()) + d.__keepdims_indexing__ = True + + # Orthogonal indexing + self.assertEqual(d[[0], [0, 2]].shape, (1, 2)) + self.assertEqual(d[[0, 1], [0, 2]].shape, (2, 2)) + self.assertEqual(d[[0, 1], [2]].shape, (2, 1)) + + # Ancillary masks + # + # TODODASK: Test __getitem__ with ancillary masks. Can only do + # this when cf.Data.where has been daskified - def test_Data___getitem__(self): + def test_Data__setitem__(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - def test_Data___setitem__(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return + for hardmask in (False, True): + a = np.ma.arange(90).reshape(9, 10) + if hardmask: + a.harden_mask() + else: + a.soften_mask() - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - for hardmask in (False, True): - a = numpy.ma.arange(3000).reshape(50, 60) - if hardmask: - a.harden_mask() - else: - a.soften_mask() + d = cf.Data(a.copy(), "metres", hardmask=hardmask, chunks=(3, 5)) - d = cf.Data(a.filled(), "m") - d.hardmask = hardmask + a[:, 1] = np.ma.masked + d[:, 1] = cf.masked - for n, (j, i) in enumerate( - ( - (34, 23), - (0, 0), - (-1, -1), - (slice(40, 50), slice(58, 60)), - (Ellipsis, slice(None)), - (slice(None), Ellipsis), - ) - ): - n = -n - 1 - for dvalue, avalue in ( - (n, n), - (cf.masked, numpy.ma.masked), - (n, n), - ): - message = ( - "hardmask={}, " - "cf.Data[{}, {}]]={}={} failed".format( - hardmask, j, i, dvalue, avalue - ) - ) - d[j, i] = dvalue - a[j, i] = avalue - - self.assertIn( - (d.array == a).all(), - (True, numpy.ma.masked), - message, - ) - self.assertTrue( - ( - d.mask.array == numpy.ma.getmaskarray(a) - ).all(), - "d.mask.array={!r} \n" - "numpy.ma.getmaskarray(a)={!r}".format( - d.mask.array, numpy.ma.getmaskarray(a) - ), - ) - # --- End: for - - a = numpy.ma.arange(3000).reshape(50, 60) - if hardmask: - a.harden_mask() - else: - a.soften_mask() - - d = cf.Data(a.filled(), "m") - d.hardmask = hardmask - - 
(j, i) = (slice(0, 2), slice(0, 3)) - array = numpy.array([[1, 2, 6], [3, 4, 5]]) * -1 - for dvalue in ( - array, - numpy.ma.masked_where(array < -2, array), - array, - ): - message = "cf.Data[{}, {}]={} failed".format( - j, i, dvalue - ) - d[j, i] = dvalue - a[j, i] = dvalue + a[0, 2] = -6 + d[0, 2] = -6 - self.assertIn( - (d.array == a).all(), - (True, numpy.ma.masked), - message, - ) - self.assertTrue( - (d.mask.array == numpy.ma.getmaskarray(a)).all(), - message, - ) + a[0:3, 1] = -1 + d[0:3, 1] = -1 + + a[0:2, 3] = -1 + d[0:2, 3] = -1 + + a[3, 4:6] = -2 + d[3, 4:6] = -2 + + a[0:2, 1:4] = -3 + d[0:2, 1:4] = -3 + + a[5:7, [3, 5, 6]] = -4 + d[5:7, [3, 5, 6]] = -4 + a[8, [8, 6, 5]] = -5 + d[8, [8, 6, 5]] = -5 + + a[...] = -a + d[...] = -d + + a[0] = a[2] + d[0] = d[2] + + self.assertTrue((d.array == a).all()) + self.assertTrue((d.array.mask == a.mask).all()) + + # Units + a = np.ma.arange(90).reshape(9, 10) + d = cf.Data(a, "metres") + d[...] = cf.Data(a * 100, "cm") + self.assertTrue((d.array == a).all()) + self.assertTrue((d.array.mask == a.mask).all()) + + # Cyclic axes + d.cyclic(1) + self.assertTrue((d[0].array == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).all()) + d[0, -1:1] = [-99, -1] + self.assertTrue( + (d[0].array == [-1, 1, 2, 3, 4, 5, 6, 7, 8, -99]).all() + ) + self.assertEqual(d.cyclic(), set([1])) + + # Multiple list/1-d array indices + with self.assertRaises(NotImplementedError): + d[[1, 2], [0, 4, 1]] = 9 + + with self.assertRaises(NotImplementedError): + d[[1], [0, 4, 1]] = 9 + + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_outerproduct(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(numpy.arange(1200).reshape(40, 30)) + d = cf.Data(np.arange(1200).reshape(40, 30)) - e = cf.Data(numpy.arange(5)) - f = d.outerproduct(e) - self.assertEqual(f.shape, (40, 30, 5)) + e = cf.Data(np.arange(5)) + f = d.outerproduct(e) + self.assertEqual(f.shape, (40, 30, 5)) - e = cf.Data(numpy.arange(5).reshape(5, 1)) - f = d.outerproduct(e) - self.assertEqual(f.shape, (40, 30, 5, 1)) + e = cf.Data(np.arange(5).reshape(5, 1)) + f = d.outerproduct(e) + self.assertEqual(f.shape, (40, 30, 5, 1)) - e = cf.Data(numpy.arange(30).reshape(6, 5)) - f = d.outerproduct(e) - self.assertEqual(f.shape, (40, 30, 6, 5)) + e = cf.Data(np.arange(30).reshape(6, 5)) + f = d.outerproduct(e) + self.assertEqual(f.shape, (40, 30, 6, 5)) - e = cf.Data(7) - f = d.outerproduct(e) - self.assertEqual(f.shape, (40, 30), f.shape) + e = cf.Data(7) + f = d.outerproduct(e) + self.assertEqual(f.shape, (40, 30), f.shape) - e = cf.Data(numpy.arange(5)) - self.assertIsNone(d.outerproduct(e, inplace=True)) - self.assertEqual(d.shape, (40, 30, 5), d.shape) + e = cf.Data(np.arange(5)) + self.assertIsNone(d.outerproduct(e, inplace=True)) + self.assertEqual(d.shape, (40, 30, 5), d.shape) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_all(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(numpy.array([[0] * 1000])) - self.assertTrue(not d.all()) - d[-1, -1] = 1 - self.assertFalse(d.all()) - d[...] = 1 - self.assertTrue(d.all()) - d[...] = cf.masked - self.assertTrue(d.all()) + d = cf.Data(np.array([[0] * 1000])) + self.assertTrue(not d.all()) + d[-1, -1] = 1 + self.assertFalse(d.all()) + d[...] = 1 + self.assertTrue(d.all()) + d[...] 
= cf.masked + self.assertTrue(d.all()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_any(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(numpy.array([[0] * 1000])) - self.assertFalse(d.any()) - d[-1, -1] = 1 - self.assertTrue(d.any()) - d[...] = 1 - self.assertTrue(d.any()) - d[...] = cf.masked - self.assertFalse(d.any()) + d = cf.Data(np.array([[0] * 1000])) + self.assertFalse(d.any()) + d[-1, -1] = 1 + self.assertTrue(d.any()) + d[...] = 1 + self.assertTrue(d.any()) + d[...] = cf.masked + self.assertFalse(d.any()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "AssertionError: -999 != 0") def test_Data_array(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -1014,51 +1078,47 @@ def test_Data_array(self): d = cf.Data(9, "km") a = d.array self.assertEqual(a.shape, ()) - self.assertEqual(a, numpy.array(9)) + self.assertEqual(a, np.array(9)) d[...] 
= cf.masked a = d.array self.assertEqual(a.shape, ()) - self.assertIs(a[()], numpy.ma.masked) + self.assertIs(a[()], np.ma.masked) # Non-scalar numeric array - b = numpy.arange(10 * 15 * 19).reshape(10, 1, 15, 19) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(b, "km") - a = d.array - a[0, 0, 0, 0] = -999 - a2 = d.array - self.assertEqual(a2[0, 0, 0, 0], 0) - self.assertEqual(a2.shape, b.shape) - self.assertTrue((a2 == b).all()) - self.assertFalse((a2 == a).all()) - - d = cf.Data( - [["2000-12-3 12:00"]], "days since 2000-12-01", dt=True - ) - a = d.array - self.assertTrue((a == numpy.array([[2.5]])).all()) + b = np.arange(10 * 15 * 19).reshape(10, 1, 15, 19) + d = cf.Data(b, "km") + a = d.array + a[0, 0, 0, 0] = -999 + a2 = d.array + self.assertEqual(a2[0, 0, 0, 0], 0) + self.assertEqual(a2.shape, b.shape) + self.assertTrue((a2 == b).all()) + self.assertFalse((a2 == a).all()) + + d = cf.Data([["2000-12-3 12:00"]], "days since 2000-12-01", dt=True) + a = d.array + self.assertTrue((a == np.array([[2.5]])).all()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_binary_mask(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - a = numpy.ma.ones((1000,), dtype="int32") - a[[1, 900]] = numpy.ma.masked + a = np.ma.ones((1000,), dtype="int32") + a[[1, 900]] = np.ma.masked a[[0, 10, 910]] = 0 - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(numpy.arange(1000.0), "radians") - d[[1, 900]] = cf.masked - d[[10, 910]] = 0 + d = cf.Data(np.arange(1000.0), "radians") + d[[1, 900]] = cf.masked + d[[10, 910]] = 0 - b = d.binary_mask + b = d.binary_mask - self.assertEqual(b.Units, cf.Units("1")) - self.assertEqual(b.dtype, numpy.dtype("int32")) - self.assertTrue((b.array == a).all()) + self.assertEqual(b.Units, cf.Units("1")) + self.assertEqual(b.dtype, np.dtype("int32")) + self.assertTrue((b.array == a).all()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_clip(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -1067,24 +1127,23 @@ def test_Data_clip(self): c1 = 34.345456567 a = self.a + 0.34567 - ac = numpy.clip(a, c0, c1) + ac = np.clip(a, c0, c1) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(a, "km") - self.assertIsNotNone(d.clip(c0, c1)) - self.assertIsNone(d.clip(c0, c1, inplace=True)) + d = cf.Data(a, "km") + self.assertIsNotNone(d.clip(c0, c1)) + self.assertIsNone(d.clip(c0, c1, inplace=True)) - d = cf.Data(a, "km") - e = d.clip(c0, c1) - self.assertTrue((e.array == ac).all()) + d = cf.Data(a, "km") + e = d.clip(c0, c1) + self.assertTrue((e.array == ac).all()) - e = d.clip(c0 * 1000, c1 * 1000, units="m") - self.assertTrue((e.array == ac).all()) + e = d.clip(c0 * 1000, c1 * 1000, units="m") + self.assertTrue((e.array == ac).all()) - d.clip(c0 * 100, c1 * 100, units="10m", inplace=True) - self.assertTrue(d.allclose(ac, rtol=1e-05, atol=1e-08)) + d.clip(c0 * 100, c1 * 100, units="10m", inplace=True) + 
self.assertTrue(d.allclose(ac, rtol=1e-05, atol=1e-08)) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_months_years(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -1094,8 +1153,8 @@ def test_Data_months_years(self): [1.0, 2], units=cf.Units("months since 2000-1-1", calendar=calendar), ) - self.assertTrue((d.array == numpy.array([1.0, 2])).all()) - a = numpy.array( + self.assertTrue((d.array == np.array([1.0, 2])).all()) + a = np.array( [ cf.dt(2000, 2, 1, 10, 29, 3, 831223, calendar=calendar), cf.dt(2000, 3, 1, 20, 58, 7, 662446, calendar=calendar), @@ -1111,8 +1170,8 @@ def test_Data_months_years(self): [1.0, 2], units=cf.Units("months since 2000-1-1", calendar=calendar), ) - self.assertTrue((d.array == numpy.array([1.0, 2])).all()) - a = numpy.array( + self.assertTrue((d.array == np.array([1.0, 2])).all()) + a = np.array( [ cf.dt(2000, 1, 31, 10, 29, 3, 831223, calendar=calendar), cf.dt(2000, 3, 1, 20, 58, 7, 662446, calendar=calendar), @@ -1126,8 +1185,8 @@ def test_Data_months_years(self): d = cf.Data( [1.0, 2], units=cf.Units("years since 2000-1-1", calendar=calendar) ) - self.assertTrue((d.array == numpy.array([1.0, 2])).all()) - a = numpy.array( + self.assertTrue((d.array == np.array([1.0, 2])).all()) + a = np.array( [ cf.dt(2001, 1, 6, 5, 48, 45, 974678, calendar=calendar), cf.dt(2002, 1, 11, 11, 37, 31, 949357, calendar=calendar), @@ -1141,8 +1200,8 @@ def test_Data_months_years(self): d = cf.Data( [1.0, 2], units=cf.Units("years since 2000-1-1", calendar=calendar) ) - self.assertTrue((d.array == numpy.array([1.0, 2])).all()) - a = numpy.array( + self.assertTrue((d.array == np.array([1.0, 2])).all()) + a = np.array( [ cf.dt(2000, 12, 31, 5, 48, 45, 974678, calendar=calendar), cf.dt(2001, 12, 31, 11, 37, 31, 949357, calendar=calendar), @@ -1158,6 +1217,7 @@ def test_Data_months_years(self): ) d *= 31 + @unittest.skipIf(TEST_DASKIFIED_ONLY, "'NoneType' object is not callable") 
def test_Data_datetime_array(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -1173,7 +1233,7 @@ def test_Data_datetime_array(self): a = d.datetime_array self.assertEqual(a.shape, ()) self.assertEqual( - a, numpy.array(cf.dt("2000-12-1 12:00", calendar="standard")) + a, np.array(cf.dt("2000-12-1 12:00", calendar="standard")) ) a = d.array @@ -1203,7 +1263,7 @@ def test_Data_datetime_array(self): self.assertTrue( ( a - == numpy.array( + == np.array( [ [ cf.dt("2000-12-1 12:00", calendar="standard"), @@ -1214,193 +1274,181 @@ def test_Data_datetime_array(self): ).all() ) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data__asdatetime__asreftime__isdatetime(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data([[1.93, 5.17]], "days since 2000-12-29") - self.assertEqual(d.dtype, numpy.dtype(float)) - self.assertFalse(d._isdatetime()) + d = cf.Data([[1.93, 5.17]], "days since 2000-12-29") + self.assertEqual(d.dtype, np.dtype(float)) + self.assertFalse(d._isdatetime()) - self.assertIsNone(d._asreftime(inplace=True)) - self.assertEqual(d.dtype, numpy.dtype(float)) - self.assertFalse(d._isdatetime()) + self.assertIsNone(d._asreftime(inplace=True)) + self.assertEqual(d.dtype, np.dtype(float)) + self.assertFalse(d._isdatetime()) - self.assertIsNone(d._asdatetime(inplace=True)) - self.assertEqual(d.dtype, numpy.dtype(object)) - self.assertTrue(d._isdatetime()) + self.assertIsNone(d._asdatetime(inplace=True)) + self.assertEqual(d.dtype, np.dtype(object)) + self.assertTrue(d._isdatetime()) - self.assertIsNone(d._asdatetime(inplace=True)) - self.assertEqual(d.dtype, numpy.dtype(object)) - self.assertTrue(d._isdatetime()) + self.assertIsNone(d._asdatetime(inplace=True)) + self.assertEqual(d.dtype, np.dtype(object)) + self.assertTrue(d._isdatetime()) - self.assertIsNone(d._asreftime(inplace=True)) - 
self.assertEqual(d.dtype, numpy.dtype(float)) - self.assertFalse(d._isdatetime()) + self.assertIsNone(d._asreftime(inplace=True)) + self.assertEqual(d.dtype, np.dtype(float)) + self.assertFalse(d._isdatetime()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_ceil(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return for x in (1, -1): a = 0.9 * x * self.a - c = numpy.ceil(a) - - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(a) - e = d.ceil() - self.assertIsNone(d.ceil(inplace=True)) - self.assertTrue(d.equals(e, verbose=2)) - self.assertEqual(d.shape, c.shape) - self.assertTrue((d.array == c).all()) + c = np.ceil(a) + + d = cf.Data(a) + e = d.ceil() + self.assertIsNone(d.ceil(inplace=True)) + self.assertTrue(d.equals(e, verbose=2)) + self.assertEqual(d.shape, c.shape) + self.assertTrue((d.array == c).all()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_floor(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return for x in (1, -1): a = 0.9 * x * self.a - c = numpy.floor(a) - - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(a) - e = d.floor() - self.assertIsNone(d.floor(inplace=True)) - self.assertTrue(d.equals(e, verbose=2)) - self.assertEqual(d.shape, c.shape) - self.assertTrue((d.array == c).all()) + c = np.floor(a) + + d = cf.Data(a) + e = d.floor() + self.assertIsNone(d.floor(inplace=True)) + self.assertTrue(d.equals(e, verbose=2)) + self.assertEqual(d.shape, c.shape) + self.assertTrue((d.array == c).all()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_trunc(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return for x in (1, -1): a = 0.9 * x * self.a - c = numpy.trunc(a) - - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(a) - e = d.trunc() - self.assertIsNone(d.trunc(inplace=True)) - self.assertTrue(d.equals(e, verbose=2)) - self.assertEqual(d.shape, c.shape) - self.assertTrue((d.array == c).all()) + c = np.trunc(a) + + d = cf.Data(a) + e = d.trunc() + self.assertIsNone(d.trunc(inplace=True)) + self.assertTrue(d.equals(e, verbose=2)) + self.assertEqual(d.shape, c.shape) + self.assertTrue((d.array == c).all()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_rint(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return for x in (1, -1): a = 0.9 * x * self.a - c = numpy.rint(a) - - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(a) - d0 = d.copy() - e = d.rint() - x = e.array - - self.assertTrue((x == c).all()) - self.assertTrue(d.equals(d0, verbose=2)) - self.assertIsNone(d.rint(inplace=True)) - self.assertTrue(d.equals(e, verbose=2)) - self.assertEqual(d.shape, c.shape) - self.assertTrue((d.array == c).all()) + c = np.rint(a) + + d = cf.Data(a) + d0 = d.copy() + e = d.rint() + x = e.array + self.assertTrue((x == c).all()) + self.assertTrue(d.equals(d0, verbose=2)) + self.assertIsNone(d.rint(inplace=True)) + self.assertTrue(d.equals(e, verbose=2)) + self.assertEqual(d.shape, c.shape) + self.assertTrue((d.array == c).all()) + + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_round(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return for decimals in range(-8, 8): a = self.a + 0.34567 - c = numpy.round(a, decimals=decimals) + c = np.round(a, decimals=decimals) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(a) - e = d.round(decimals=decimals) + d = cf.Data(a) + e = d.round(decimals=decimals) - self.assertIsNone(d.round(decimals=decimals, inplace=True)) + self.assertIsNone(d.round(decimals=decimals, inplace=True)) - self.assertTrue(d.equals(e, verbose=2)) - self.assertEqual(d.shape, c.shape) - self.assertTrue((d.array == c).all()) + self.assertTrue(d.equals(e, verbose=2)) + self.assertEqual(d.shape, c.shape) + self.assertTrue((d.array == c).all()) def test_Data_datum(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(5, "metre") - self.assertEqual(d.datum(), 5) - self.assertEqual(d.datum(0), 5) - self.assertEqual(d.datum(-1), 5) - - for d in [ - cf.Data([4, 5, 6, 1, 2, 3], "metre"), - cf.Data([[4, 5, 6], [1, 2, 3]], "metre"), - ]: - self.assertEqual(d.datum(0), 4) - self.assertEqual(d.datum(-1), 3) - for index in d.ndindex(): - self.assertEqual(d.datum(index), d.array[index].item()) - self.assertEqual( - d.datum(*index), - d.array[index].item(), - "{}, {}".format( - d.datum(*index), d.array[index].item() - ), - ) - # --- End: for - - d = cf.Data(5, "metre") - d[()] = cf.masked - self.assertIs(d.datum(), cf.masked) - self.assertIs(d.datum(0), cf.masked) - self.assertIs(d.datum(-1), cf.masked) - - d = cf.Data([[5]], "metre") - d[0, 0] = cf.masked - self.assertIs(d.datum(), cf.masked) - self.assertIs(d.datum(0), cf.masked) - self.assertIs(d.datum(-1), cf.masked) - self.assertIs(d.datum(0, 0), cf.masked) - self.assertIs(d.datum(-1, 0), cf.masked) - self.assertIs(d.datum((0, 0)), cf.masked) - self.assertIs(d.datum([0, -1]), 
cf.masked) - self.assertIs(d.datum(-1, -1), cf.masked) + d = cf.Data(5, "metre") + self.assertEqual(d.datum(), 5) + self.assertEqual(d.datum(0), 5) + self.assertEqual(d.datum(-1), 5) + + for d in [ + cf.Data([4, 5, 6, 1, 2, 3], "metre"), + cf.Data([[4, 5, 6], [1, 2, 3]], "metre"), + ]: + self.assertEqual(d.datum(0), 4) + self.assertEqual(d.datum(-1), 3) + for index in d.ndindex(): + self.assertEqual(d.datum(index), d.array[index].item()) + self.assertEqual( + d.datum(*index), + d.array[index].item(), + "{}, {}".format(d.datum(*index), d.array[index].item()), + ) + # --- End: for + + d = cf.Data(5, "metre") + d[()] = cf.masked + self.assertIs(d.datum(), cf.masked) + self.assertIs(d.datum(0), cf.masked) + self.assertIs(d.datum(-1), cf.masked) + d = cf.Data([[5]], "metre") + d[0, 0] = cf.masked + self.assertIs(d.datum(), cf.masked) + self.assertIs(d.datum(0), cf.masked) + self.assertIs(d.datum(-1), cf.masked) + self.assertIs(d.datum(0, 0), cf.masked) + self.assertIs(d.datum(-1, 0), cf.masked) + self.assertIs(d.datum((0, 0)), cf.masked) + self.assertIs(d.datum([0, -1]), cf.masked) + self.assertIs(d.datum(-1, -1), cf.masked) + + @unittest.skipIf(TEST_DASKIFIED_ONLY, "TypeError: 'int' is not iterable") def test_Data_flip(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - array = numpy.arange(24000).reshape(120, 200) - d = cf.Data(array.copy(), "metre") + array = np.arange(24000).reshape(120, 200) + d = cf.Data(array.copy(), "metre") - for axes, indices in zip( - (0, 1, [0, 1]), - ( - (slice(None, None, -1), slice(None)), - (slice(None), slice(None, None, -1)), - (slice(None, None, -1), slice(None, None, -1)), - ), - ): - array = array[indices] - d.flip(axes, inplace=True) + for axes, indices in zip( + (0, 1, [0, 1]), + ( + (slice(None, None, -1), slice(None)), + (slice(None), slice(None, None, -1)), + (slice(None, None, -1), slice(None, None, -1)), + ), + ): + array = 
array[indices] + d.flip(axes, inplace=True) - self.assertTrue((d.array == array).all()) - # --- End: for + self.assertTrue((d.array == array).all()) - array = numpy.arange(3 * 4 * 5).reshape(3, 4, 5) + 1 + array = np.arange(3 * 4 * 5).reshape(3, 4, 5) + 1 d = cf.Data(array.copy(), "metre", chunk=False) d.chunk(total=[0], omit_axes=[1, 2]) @@ -1426,159 +1474,132 @@ def test_Data_flip(self): self.assertEqual(e[0].maximum(), 3 * 4 * 5) self.assertEqual(e[-1].maximum(), 4 * 5) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute 'datum'") def test_Data_max(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - for pp in (False, True): - with cf.chunksize(chunksize): - d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre") - self.assertEqual( - d.maximum(_preserve_partitions=pp), cf.Data(6, "metre") - ) - self.assertEqual( - d.maximum(_preserve_partitions=pp).datum(), 6 - ) - d[0, 2] = cf.masked - self.assertEqual(d.maximum(_preserve_partitions=pp), 5) - self.assertEqual( - d.maximum(_preserve_partitions=pp).datum(), 5 - ) - self.assertEqual( - d.maximum(_preserve_partitions=pp), - cf.Data(0.005, "km"), - ) + for pp in (False, True): + d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre") + self.assertEqual( + d.maximum(_preserve_partitions=pp), cf.Data(6, "metre") + ) + self.assertEqual(d.maximum(_preserve_partitions=pp).datum(), 6) + d[0, 2] = cf.masked + self.assertEqual(d.maximum(_preserve_partitions=pp), 5) + self.assertEqual(d.maximum(_preserve_partitions=pp).datum(), 5) + self.assertEqual( + d.maximum(_preserve_partitions=pp), cf.Data(0.005, "km") + ) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_min(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - for pp in (False, True): - with cf.chunksize(chunksize): - d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre") - self.assertEqual( - d.minimum(_preserve_partitions=pp), 
cf.Data(1, "metre") - ) - self.assertEqual( - d.minimum(_preserve_partitions=pp).datum(), 1 - ) - d[1, 0] = cf.masked - self.assertEqual(d.minimum(_preserve_partitions=pp), 2) - self.assertEqual( - d.minimum(_preserve_partitions=pp).datum(), 2 - ) - self.assertEqual( - d.minimum(_preserve_partitions=pp), - cf.Data(0.002, "km"), - ) + for pp in (False, True): + d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre") + self.assertEqual( + d.minimum(_preserve_partitions=pp), cf.Data(1, "metre") + ) + self.assertEqual(d.minimum(_preserve_partitions=pp).datum(), 1) + d[1, 0] = cf.masked + self.assertEqual(d.minimum(_preserve_partitions=pp), 2) + self.assertEqual(d.minimum(_preserve_partitions=pp).datum(), 2) + self.assertEqual( + d.minimum(_preserve_partitions=pp), cf.Data(0.002, "km") + ) def test_Data_ndindex(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - cf.chunksize(chunksize) - for d in ( - cf.Data(5, "metre"), - cf.Data([4, 5, 6, 1, 2, 3], "metre"), - cf.Data([[4, 5, 6], [1, 2, 3]], "metre"), - ): - for i, j in zip(d.ndindex(), numpy.ndindex(d.shape)): - self.assertEqual(i, j) - # --- End: for - - cf.chunksize(self.original_chunksize) + for d in ( + cf.Data(5, "metre"), + cf.Data([4, 5, 6, 1, 2, 3], "metre"), + cf.Data([[4, 5, 6], [1, 2, 3]], "metre"), + ): + for i, j in zip(d.ndindex(), np.ndindex(d.shape)): + self.assertEqual(i, j) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_pmshape'") def test_Data_roll(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - a = numpy.arange(10 * 15 * 19).reshape(10, 1, 15, 19) + a = np.arange(10 * 15 * 19).reshape(10, 1, 15, 19) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(a.copy()) + d = cf.Data(a.copy()) - _ = d._pmshape + _ = d._pmshape - e = d.roll(0, 4) - e.roll(2, 120, inplace=True) - e.roll(3, -77, inplace=True) + e = d.roll(0, 4) + e.roll(2, 120, inplace=True) + e.roll(3, -77, 
inplace=True) - a = numpy.roll(a, 4, 0) - a = numpy.roll(a, 120, 2) - a = numpy.roll(a, -77, 3) + a = np.roll(a, 4, 0) + a = np.roll(a, 120, 2) + a = np.roll(a, -77, 3) - self.assertEqual(e.shape, a.shape) - self.assertTrue((a == e.array).all()) + self.assertEqual(e.shape, a.shape) + self.assertTrue((a == e.array).all()) - f = e.roll(3, 77) - f.roll(2, -120, inplace=True) - f.roll(0, -4, inplace=True) + f = e.roll(3, 77) + f.roll(2, -120, inplace=True) + f.roll(0, -4, inplace=True) - self.assertEqual(f.shape, d.shape) - self.assertTrue(f.equals(d, verbose=2)) + self.assertEqual(f.shape, d.shape) + self.assertTrue(f.equals(d, verbose=2)) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_swapaxes(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - a = numpy.arange(10 * 15 * 19).reshape(10, 1, 15, 19) + a = np.arange(10 * 15 * 19).reshape(10, 1, 15, 19) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(a.copy()) + d = cf.Data(a.copy()) - for i in range(-a.ndim, a.ndim): - for j in range(-a.ndim, a.ndim): - b = numpy.swapaxes(a.copy(), i, j) - e = d.swapaxes(i, j) - message = "cf.Data.swapaxes({}, {}) failed".format( - i, j - ) - self.assertEqual(b.shape, e.shape, message) - self.assertTrue((b == e.array).all(), message) + for i in range(-a.ndim, a.ndim): + for j in range(-a.ndim, a.ndim): + b = np.swapaxes(a.copy(), i, j) + e = d.swapaxes(i, j) + message = "cf.Data.swapaxes({}, {}) failed".format(i, j) + self.assertEqual(b.shape, e.shape, message) + self.assertTrue((b == e.array).all(), message) def test_Data_transpose(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - a = numpy.arange(10 * 15 * 19).reshape(10, 1, 15, 19) + a = np.arange(10 * 15 * 19).reshape(10, 1, 15, 19) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(a.copy()) + d = cf.Data(a.copy()) - for indices in (range(a.ndim), range(-a.ndim, 0)): - for 
axes in itertools.permutations(indices): - a = numpy.transpose(a, axes) - d.transpose(axes, inplace=True) - message = ( - "cf.Data.transpose({}) failed: " - "d.shape={}, a.shape={}".format( - axes, d.shape, a.shape - ) - ) - self.assertEqual(d.shape, a.shape, message) - self.assertTrue((d.array == a).all(), message) + for indices in (range(a.ndim), range(-a.ndim, 0)): + for axes in itertools.permutations(indices): + a = np.transpose(a, axes) + d.transpose(axes, inplace=True) + message = ( + "cf.Data.transpose({}) failed: " + "d.shape={}, a.shape={}".format(axes, d.shape, a.shape) + ) + self.assertEqual(d.shape, a.shape, message) + self.assertTrue((d.array == a).all(), message) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_unique(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data([[4, 2, 1], [1, 2, 3]], "metre") - self.assertTrue( - (d.unique() == cf.Data([1, 2, 3, 4], "metre")).all() - ) - d[1, -1] = cf.masked - self.assertTrue( - (d.unique() == cf.Data([1, 2, 4], "metre")).all() - ) + d = cf.Data([[4, 2, 1], [1, 2, 3]], "metre") + self.assertTrue((d.unique() == cf.Data([1, 2, 3, 4], "metre")).all()) + d[1, -1] = cf.masked + self.assertTrue((d.unique() == cf.Data([1, 2, 4], "metre")).all()) + @unittest.skipIf( + TEST_DASKIFIED_ONLY, "hits 'TODODASK - use harden_mask/soften_mask'" + ) def test_Data_varray(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -1588,28 +1609,27 @@ def test_Data_varray(self): d.hardmask = False a = d.varray self.assertEqual(a.shape, ()) - self.assertEqual(a, numpy.array(9)) + self.assertEqual(a, np.array(9)) d[...] 
= cf.masked a = d.varray self.assertEqual(a.shape, ()) - self.assertIs(a[()], numpy.ma.masked) + self.assertIs(a[()], np.ma.masked) a[()] = 18 - self.assertEqual(a, numpy.array(18)) - - b = numpy.arange(10 * 15 * 19).reshape(10, 1, 15, 19) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(b, "km") - e = d.copy() - v = e.varray - v[0, 0, 0, 0] = -999 - v = e.varray - self.assertEqual(v[0, 0, 0, 0], -999) - self.assertEqual(v.shape, b.shape) - self.assertFalse((v == b).all()) - v[0, 0, 0, 0] = 0 - self.assertTrue((v == b).all()) + self.assertEqual(a, np.array(18)) + b = np.arange(10 * 15 * 19).reshape(10, 1, 15, 19) + d = cf.Data(b, "km") + e = d.copy() + v = e.varray + v[0, 0, 0, 0] = -999 + v = e.varray + self.assertEqual(v[0, 0, 0, 0], -999) + self.assertEqual(v.shape, b.shape) + self.assertFalse((v == b).all()) + v[0, 0, 0, 0] = 0 + self.assertTrue((v == b).all()) + + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_year_month_day_hour_minute_second(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -1632,333 +1652,297 @@ def test_Data_year_month_day_hour_minute_second(self): self.assertTrue(d.minute.equals(cf.Data([[37, 25]]))) self.assertTrue(d.second.equals(cf.Data([[26, 26]]))) - cf.chunksize(self.original_chunksize) - + @unittest.skipIf(TEST_DASKIFIED_ONLY, "'NoneType' is not iterable") def test_Data_BINARY_AND_UNARY_OPERATORS(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - cf.chunksize(chunksize) - array = numpy.arange(3 * 4 * 5).reshape(3, 4, 5) + 1 + array = np.arange(3 * 4 * 5).reshape(3, 4, 5) + 1 - arrays = ( - numpy.arange(3 * 4 * 5).reshape(3, 4, 5) + 1.0, - numpy.arange(3 * 4 * 5).reshape(3, 4, 5) + 1, - ) + arrays = ( + np.arange(3 * 4 * 5).reshape(3, 4, 5) + 1.0, + np.arange(3 * 4 * 5).reshape(3, 4, 5) + 1, + ) - for a0 in arrays: - for a1 in arrays[::-1]: - d = cf.Data( 
- a0[(slice(None, None, -1),) * a0.ndim], "metre" - ) - d.flip(inplace=True) - x = cf.Data(a1, "metre") + for a0 in arrays: + for a1 in arrays[::-1]: + d = cf.Data(a0[(slice(None, None, -1),) * a0.ndim], "metre") + d.flip(inplace=True) + x = cf.Data(a1, "metre") + + message = "Failed in {!r}+{!r}".format(d, x) + self.assertTrue( + (d + x).equals(cf.Data(a0 + a1, "m"), verbose=1), message + ) + message = "Failed in {!r}*{!r}".format(d, x) + self.assertTrue( + (d * x).equals(cf.Data(a0 * a1, "m2"), verbose=1), message + ) + message = "Failed in {!r}/{!r}".format(d, x) + self.assertTrue( + (d / x).equals(cf.Data(a0 / a1, "1"), verbose=1), message + ) + message = "Failed in {!r}-{!r}".format(d, x) + self.assertTrue( + (d - x).equals(cf.Data(a0 - a1, "m"), verbose=1), message + ) + message = "Failed in {!r}//{!r}".format(d, x) + self.assertTrue( + (d // x).equals(cf.Data(a0 // a1, "1"), verbose=1), message + ) + + message = "Failed in {!r}.__truediv__//{!r}".format(d, x) + self.assertTrue( + d.__truediv__(x).equals( + cf.Data(array.__truediv__(array), "1"), verbose=1 + ), + message, + ) + + message = "Failed in {!r}__rtruediv__{!r}".format(d, x) + self.assertTrue( + d.__rtruediv__(x).equals( + cf.Data(array.__rtruediv__(array), "1"), verbose=1 + ), + message, + ) + + try: + d ** x + except Exception: + pass + else: + message = "Failed in {!r}**{!r}".format(d, x) + self.assertTrue((d ** x).all(), message) + # --- End: for - message = "Failed in {!r}+{!r}".format(d, x) + for a0 in arrays: + d = cf.Data(a0, "metre") + for x in (2, 2.0): + message = "Failed in {!r}+{}".format(d, x) + self.assertTrue( + (d + x).equals(cf.Data(a0 + x, "m"), verbose=1), message + ) + message = "Failed in {!r}*{}".format(d, x) + self.assertTrue( + (d * x).equals(cf.Data(a0 * x, "m"), verbose=1), message + ) + message = "Failed in {!r}/{}".format(d, x) + self.assertTrue( + (d / x).equals(cf.Data(a0 / x, "m"), verbose=1), message + ) + message = "Failed in {!r}-{}".format(d, x) + self.assertTrue( 
+ (d - x).equals(cf.Data(a0 - x, "m"), verbose=1), message + ) + message = "Failed in {!r}//{}".format(d, x) + self.assertTrue( + (d // x).equals(cf.Data(a0 // x, "m"), verbose=1), message + ) + message = "Failed in {!r}**{}".format(d, x) + self.assertTrue( + (d ** x).equals(cf.Data(a0 ** x, "m2"), verbose=1), message + ) + message = "Failed in {!r}.__truediv__{}".format(d, x) + self.assertTrue( + d.__truediv__(x).equals( + cf.Data(a0.__truediv__(x), "m"), verbose=1 + ), + message, + ) + message = "Failed in {!r}.__rtruediv__{}".format(d, x) + self.assertTrue( + d.__rtruediv__(x).equals( + cf.Data(a0.__rtruediv__(x), "m-1"), verbose=1 + ), + message, + ) + + message = "Failed in {}+{!r}".format(x, d) + self.assertTrue( + (x + d).equals(cf.Data(x + a0, "m"), verbose=1), message + ) + message = "Failed in {}*{!r}".format(x, d) + self.assertTrue( + (x * d).equals(cf.Data(x * a0, "m"), verbose=1), message + ) + message = "Failed in {}/{!r}".format(x, d) + self.assertTrue( + (x / d).equals(cf.Data(x / a0, "m-1"), verbose=1), message + ) + message = "Failed in {}-{!r}".format(x, d) + self.assertTrue( + (x - d).equals(cf.Data(x - a0, "m"), verbose=1), message + ) + message = "Failed in {}//{!r}\n{!r}\n{!r}".format( + x, d, x // d, x // a0 + ) + self.assertTrue( + (x // d).equals(cf.Data(x // a0, "m-1"), verbose=1), + message, + ) + + try: + x ** d + except Exception: + pass + else: + message = "Failed in {}**{!r}".format(x, d) + self.assertTrue((x ** d).all(), message) + + a = a0.copy() + try: + a += x + except TypeError: + pass + else: + e = d.copy() + e += x + message = "Failed in {!r}+={}".format(d, x) self.assertTrue( - (d + x).equals(cf.Data(a0 + a1, "m"), verbose=1), - message, + e.equals(cf.Data(a, "m"), verbose=1), message ) - message = "Failed in {!r}*{!r}".format(d, x) + + a = a0.copy() + try: + a *= x + except TypeError: + pass + else: + e = d.copy() + e *= x + message = "Failed in {!r}*={}".format(d, x) self.assertTrue( - (d * x).equals(cf.Data(a0 * a1, "m2"), 
verbose=1), - message, + e.equals(cf.Data(a, "m"), verbose=1), message ) - message = "Failed in {!r}/{!r}".format(d, x) + + a = a0.copy() + try: + a /= x + except TypeError: + pass + else: + e = d.copy() + e /= x + message = "Failed in {!r}/={}".format(d, x) self.assertTrue( - (d / x).equals(cf.Data(a0 / a1, "1"), verbose=1), - message, + e.equals(cf.Data(a, "m"), verbose=1), message ) - message = "Failed in {!r}-{!r}".format(d, x) + + a = a0.copy() + try: + a -= x + except TypeError: + pass + else: + e = d.copy() + e -= x + message = "Failed in {!r}-={}".format(d, x) self.assertTrue( - (d - x).equals(cf.Data(a0 - a1, "m"), verbose=1), - message, + e.equals(cf.Data(a, "m"), verbose=1), message ) - message = "Failed in {!r}//{!r}".format(d, x) + + a = a0.copy() + try: + a //= x + except TypeError: + pass + else: + e = d.copy() + e //= x + message = "Failed in {!r}//={}".format(d, x) self.assertTrue( - (d // x).equals(cf.Data(a0 // a1, "1"), verbose=1), - message, + e.equals(cf.Data(a, "m"), verbose=1), message ) - message = "Failed in {!r}.__truediv__//{!r}".format(d, x) + a = a0.copy() + try: + a **= x + except TypeError: + pass + else: + e = d.copy() + e **= x + message = "Failed in {!r}**={}".format(d, x) self.assertTrue( - d.__truediv__(x).equals( - cf.Data(array.__truediv__(array), "1"), verbose=1 - ), - message, + e.equals(cf.Data(a, "m2"), verbose=1), message ) - message = "Failed in {!r}__rtruediv__{!r}".format(d, x) + a = a0.copy() + try: + a.__itruediv__(x) + except TypeError: + pass + else: + e = d.copy() + e.__itruediv__(x) + message = "Failed in {!r}.__itruediv__({})".format(d, x) self.assertTrue( - d.__rtruediv__(x).equals( - cf.Data(array.__rtruediv__(array), "1"), verbose=1 - ), - message, + e.equals(cf.Data(a, "m"), verbose=1), message ) - - try: - d ** x - except Exception: - pass - else: - message = "Failed in {!r}**{!r}".format(d, x) - self.assertTrue((d ** x).all(), message) # --- End: for - for a0 in arrays: - d = cf.Data(a0, "metre") - for x 
in ( - 2, - 2.0, - ): - message = "Failed in {!r}+{}".format(d, x) - self.assertTrue( - (d + x).equals(cf.Data(a0 + x, "m"), verbose=1), - message, - ) - message = "Failed in {!r}*{}".format(d, x) - self.assertTrue( - (d * x).equals(cf.Data(a0 * x, "m"), verbose=1), - message, - ) - message = "Failed in {!r}/{}".format(d, x) - self.assertTrue( - (d / x).equals(cf.Data(a0 / x, "m"), verbose=1), - message, - ) - message = "Failed in {!r}-{}".format(d, x) - self.assertTrue( - (d - x).equals(cf.Data(a0 - x, "m"), verbose=1), - message, - ) - message = "Failed in {!r}//{}".format(d, x) - self.assertTrue( - (d // x).equals(cf.Data(a0 // x, "m"), verbose=1), - message, - ) - message = "Failed in {!r}**{}".format(d, x) + for x in (cf.Data(2, "metre"), cf.Data(2.0, "metre")): + self.assertTrue( + (d + x).equals(cf.Data(a0 + x.datum(), "m"), verbose=1) + ) + self.assertTrue( + (d * x).equals(cf.Data(a0 * x.datum(), "m2"), verbose=1) + ) + self.assertTrue( + (d / x).equals(cf.Data(a0 / x.datum(), "1"), verbose=1) + ) + self.assertTrue( + (d - x).equals(cf.Data(a0 - x.datum(), "m"), verbose=1) + ) + self.assertTrue( + (d // x).equals(cf.Data(a0 // x.datum(), "1"), verbose=1) + ) + + try: + d ** x + except Exception: + pass + else: self.assertTrue( - (d ** x).equals(cf.Data(a0 ** x, "m2"), verbose=1), - message, + (x ** d).all(), "{}**{}".format(x, repr(d)) ) - message = "Failed in {!r}.__truediv__{}".format(d, x) - self.assertTrue( - d.__truediv__(x).equals( - cf.Data(a0.__truediv__(x), "m"), verbose=1 - ), - message, + + self.assertTrue( + d.__truediv__(x).equals( + cf.Data(a0.__truediv__(x.datum()), ""), verbose=1 ) - message = "Failed in {!r}.__rtruediv__{}".format(d, x) - self.assertTrue( - d.__rtruediv__(x).equals( - cf.Data(a0.__rtruediv__(x), "m-1"), verbose=1 - ), - message, - ) - - message = "Failed in {}+{!r}".format(x, d) - self.assertTrue( - (x + d).equals(cf.Data(x + a0, "m"), verbose=1), - message, - ) - message = "Failed in {}*{!r}".format(x, d) - 
self.assertTrue( - (x * d).equals(cf.Data(x * a0, "m"), verbose=1), - message, - ) - message = "Failed in {}/{!r}".format(x, d) - self.assertTrue( - (x / d).equals(cf.Data(x / a0, "m-1"), verbose=1), - message, - ) - message = "Failed in {}-{!r}".format(x, d) - self.assertTrue( - (x - d).equals(cf.Data(x - a0, "m"), verbose=1), - message, - ) - message = "Failed in {}//{!r}\n{!r}\n{!r}".format( - x, d, x // d, x // a0 - ) - self.assertTrue( - (x // d).equals(cf.Data(x // a0, "m-1"), verbose=1), - message, - ) - - try: - x ** d - except Exception: - pass - else: - message = "Failed in {}**{!r}".format(x, d) - self.assertTrue((x ** d).all(), message) - - a = a0.copy() - try: - a += x - except TypeError: - pass - else: - e = d.copy() - e += x - message = "Failed in {!r}+={}".format(d, x) - self.assertTrue( - e.equals(cf.Data(a, "m"), verbose=1), message - ) - - a = a0.copy() - try: - a *= x - except TypeError: - pass - else: - e = d.copy() - e *= x - message = "Failed in {!r}*={}".format(d, x) - self.assertTrue( - e.equals(cf.Data(a, "m"), verbose=1), message - ) - - a = a0.copy() - try: - a /= x - except TypeError: - pass - else: - e = d.copy() - e /= x - message = "Failed in {!r}/={}".format(d, x) - self.assertTrue( - e.equals(cf.Data(a, "m"), verbose=1), message - ) - - a = a0.copy() - try: - a -= x - except TypeError: - pass - else: - e = d.copy() - e -= x - message = "Failed in {!r}-={}".format(d, x) - self.assertTrue( - e.equals(cf.Data(a, "m"), verbose=1), message - ) - - a = a0.copy() - try: - a //= x - except TypeError: - pass - else: - e = d.copy() - e //= x - message = "Failed in {!r}//={}".format(d, x) - self.assertTrue( - e.equals(cf.Data(a, "m"), verbose=1), message - ) - - a = a0.copy() - try: - a **= x - except TypeError: - pass - else: - e = d.copy() - e **= x - message = "Failed in {!r}**={}".format(d, x) - self.assertTrue( - e.equals(cf.Data(a, "m2"), verbose=1), message - ) - - a = a0.copy() - try: - a.__itruediv__(x) - except TypeError: - pass - 
else: - e = d.copy() - e.__itruediv__(x) - message = "Failed in {!r}.__itruediv__({})".format( - d, x - ) - self.assertTrue( - e.equals(cf.Data(a, "m"), verbose=1), message - ) - # --- End: for - - for x in (cf.Data(2, "metre"), cf.Data(2.0, "metre")): - self.assertTrue( - (d + x).equals(cf.Data(a0 + x.datum(), "m"), verbose=1) - ) - self.assertTrue( - (d * x).equals( - cf.Data(a0 * x.datum(), "m2"), verbose=1 - ) - ) - self.assertTrue( - (d / x).equals(cf.Data(a0 / x.datum(), "1"), verbose=1) - ) - self.assertTrue( - (d - x).equals(cf.Data(a0 - x.datum(), "m"), verbose=1) - ) - self.assertTrue( - (d // x).equals( - cf.Data(a0 // x.datum(), "1"), verbose=1 - ) - ) - - try: - d ** x - except Exception: - pass - else: - self.assertTrue( - (x ** d).all(), "{}**{}".format(x, repr(d)) - ) - - self.assertTrue( - d.__truediv__(x).equals( - cf.Data(a0.__truediv__(x.datum()), ""), verbose=1 - ) - ) - # --- End: for - - cf.chunksize(self.original_chunksize) + ) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_BROADCASTING(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return A = [ - numpy.array(3), - numpy.array([3]), - numpy.array([3]).reshape(1, 1), - numpy.array([3]).reshape(1, 1, 1), - numpy.arange(5).reshape(5, 1), - numpy.arange(5).reshape(1, 5), - numpy.arange(5).reshape(1, 5, 1), - numpy.arange(5).reshape(5, 1, 1), - numpy.arange(5).reshape(1, 1, 5), - numpy.arange(25).reshape(1, 5, 5), - numpy.arange(25).reshape(5, 1, 5), - numpy.arange(25).reshape(5, 5, 1), - numpy.arange(125).reshape(5, 5, 5), + np.array(3), + np.array([3]), + np.array([3]).reshape(1, 1), + np.array([3]).reshape(1, 1, 1), + np.arange(5).reshape(5, 1), + np.arange(5).reshape(1, 5), + np.arange(5).reshape(1, 5, 1), + np.arange(5).reshape(5, 1, 1), + np.arange(5).reshape(1, 1, 5), + np.arange(25).reshape(1, 5, 5), + np.arange(25).reshape(5, 1, 5), + np.arange(25).reshape(5, 5, 1), + np.arange(125).reshape(5, 5, 5), ] - for chunksize in self.chunk_sizes: - cf.chunksize(chunksize) - for a in A: - for b in A: - d = cf.Data(a) - e = cf.Data(b) - ab = a * b - de = d * e - self.assertEqual(de.shape, ab.shape) - self.assertTrue((de.array == ab).all()) - # --- End: for - - cf.chunksize(self.original_chunksize) + for a in A: + for b in A: + d = cf.Data(a) + e = cf.Data(b) + ab = a * b + de = d * e + self.assertEqual(de.shape, ab.shape) + self.assertTrue((de.array == ab).all()) def test_Data_ERROR(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: @@ -1999,6 +1983,7 @@ def test_Data_ERROR(self): cf.Data.mask_fpe(oldm) cf.Data.seterr(**olds) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_shape'") def test_Data__len__(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -2058,32 +2043,28 @@ def test_Data__round__(self): with self.assertRaises(Exception): _ = round(cf.Data([1, 2])) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_argmax(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): + d = cf.Data(np.arange(1200).reshape(40, 5, 6)) - d = cf.Data(numpy.arange(1200).reshape(40, 5, 6)) + self.assertEqual(d.argmax(), 1199) + self.assertEqual(d.argmax(unravel=True), (39, 4, 5)) - self.assertEqual(d.argmax(), 1199) - self.assertEqual(d.argmax(unravel=True), (39, 4, 5)) - - e = d.argmax(axis=1) - self.assertEqual(e.shape, (40, 6)) - self.assertTrue( - e.equals( - cf.Data.full(shape=(40, 6), fill_value=4, dtype=int) - ) - ) + e = d.argmax(axis=1) + self.assertEqual(e.shape, (40, 6)) + self.assertTrue( + e.equals(cf.Data.full(shape=(40, 6), fill_value=4, dtype=int)) + ) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits 'NoneType' is not iterable") def test_Data__collapse_SHAPE(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - a = numpy.arange(-100, 200.0, dtype=float).reshape(3, 4, 5, 5) - _ = numpy.ones(a.shape, dtype=float) + a = np.arange(-100, 200.0, dtype=float).reshape(3, 4, 5, 5) for h in ( "sample_size", @@ -2169,919 +2150,835 @@ def test_Data__collapse_SHAPE(self): ) # --- End: for + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_max_min_sum_sum_of_squares(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - for pp in (True, False): - cf.chunksize(chunksize) - - # unweighted, unmasked - d = cf.Data(self.a) - for np, h in zip( - (numpy.sum, numpy.amin, numpy.amax, numpy.sum), - ("sum", "min", "max", "sum_of_squares"), - ): - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - if h == "sum_of_squares": - b = b ** 2 - - b = np(b, axis=-1) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked " 
- "\ne={}, \nb={}".format(h, axes, e.array, b), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma) - for np, h in zip( - (numpy.ma.sum, numpy.ma.amin, numpy.ma.amax, numpy.ma.sum), - ("sum", "min", "max", "sum_of_squares"), - ): - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - if h == "sum_of_squares": - b = b ** 2 + for pp in (True, False): + # unweighted, unmasked + d = cf.Data(self.a) + for _np, h in zip( + (np.sum, np.amin, np.amax, np.sum), + ("sum", "min", "max", "sum_of_squares"), + ): + for axes in self.axes_combinations: + b = reshape_array(self.a, axes) + if h == "sum_of_squares": + b = b ** 2 - b = np(b, axis=-1) - b = numpy.ma.asanyarray(b) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) + b = _np(b, axis=-1) + e = getattr(d, h)( + axes=axes, squeeze=True, _preserve_partitions=pp + ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{}, axis={}, unweighted, unmasked " + "\ne={}, \nb={}".format(h, axes, e.array, b), + ) - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, \nb.mask={}".format( - h, axes, e.mask.array, b.mask - ), - ) + # unweighted, masked + d = cf.Data(self.ma) + for _np, h in zip( + (np.ma.sum, np.ma.amin, np.ma.amax, np.ma.sum), + ("sum", "min", "max", "sum_of_squares"), + ): + for axes in self.axes_combinations: + b = reshape_array(self.ma, axes) + if h == "sum_of_squares": + b = b ** 2 + + b = _np(b, axis=-1) + b = np.ma.asanyarray(b) + e = getattr(d, h)( + axes=axes, squeeze=True, _preserve_partitions=pp + ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) - # --- End: for + self.assertTrue( + (e.mask.array == b.mask).all(), + "{}, axis={}, \ne.mask={}, \nb.mask={}".format( + h, axes, e.mask.array, b.mask + ), + ) - cf.chunksize(self.original_chunksize) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{}, 
axis={}, unweighted, masked " + "\ne={}, \nb={}".format(h, axes, e.array, b), + ) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_median(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - b = numpy.median(b, axis=-1) + for pp in (True, False): + # unweighted, unmasked + d = cf.Data(self.a) + for axes in self.axes_combinations: + b = reshape_array(self.a, axes) + b = np.median(b, axis=-1) - e = d.median( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "median, axis={}, unweighted, unmasked " - "\ne={}, \nb={}".format(axes, e.array, b), - ) + e = d.median(axes=axes, squeeze=True, _preserve_partitions=pp) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "median, axis={}, unweighted, unmasked " + "\ne={}, \nb={}".format(axes, e.array, b), + ) - # unweighted, masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - b = numpy.ma.filled(b, numpy.nan) - with numpy.testing.suppress_warnings() as sup: - sup.filter( - RuntimeWarning, - message=".*All-NaN slice encountered", - ) - b = numpy.nanpercentile(b, 50, axis=-1) - - b = numpy.ma.masked_where( - numpy.isnan(b), b, copy=False - ) - b = numpy.ma.asanyarray(b) + # unweighted, masked + d = cf.Data(self.ma) + for axes in self.axes_combinations: + b = reshape_array(self.ma, axes) + b = np.ma.filled(b, np.nan) + with np.testing.suppress_warnings() as sup: + sup.filter( + RuntimeWarning, message=".*All-NaN slice encountered" + ) + b = np.nanpercentile(b, 50, axis=-1) - e = d.median( - axes=axes, squeeze=True, _preserve_partitions=pp - ) + b = np.ma.masked_where(np.isnan(b), b, copy=False) + b = np.ma.asanyarray(b) 
- self.assertTrue( - (e.mask.array == b.mask).all(), - "median, axis={}, \ne.mask={}, " - "\nb.mask={}".format(axes, e.mask.array, b.mask), - ) + e = d.median(axes=axes, squeeze=True, _preserve_partitions=pp) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "median, axis={}, unweighted, masked " - "\ne={}, \nb={}".format(axes, e.array, b), - ) + self.assertTrue( + (e.mask.array == b.mask).all(), + "median, axis={}, \ne.mask={}, " + "\nb.mask={}".format(axes, e.mask.array, b.mask), + ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "median, axis={}, unweighted, masked " + "\ne={}, \nb={}".format(axes, e.array, b), + ) + + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_pmndim'") def test_Data_percentile(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(self.a) - - # Percentiles taken across *all axes* - ranks = [[30, 60, 90], [20], 80] # include valid singular form + d = cf.Data(self.a) - for rank in ranks: - # Note: in cf the default is squeeze=False, but - # numpy has an inverse parameter called keepdims - # which is by default False also, one must be set - # to the non-default for equivalents. 
So first - # cases (n1, n1) are both squeezed, (n2, n2) are - # not: - a1 = numpy.percentile(d, rank) # keepdims=False default - b1 = d.percentile(rank, squeeze=True) - self.assertTrue(b1.allclose(a1, rtol=1e-05, atol=1e-08)) - a2 = numpy.percentile(d, rank, keepdims=True) - b2 = d.percentile(rank) # squeeze=False default - self.assertTrue(b2.shape, a2.shape) - self.assertTrue(b2.allclose(a2, rtol=1e-05, atol=1e-08)) + # Percentiles taken across *all axes* + ranks = [[30, 60, 90], [20], 80] # include valid singular form + + for rank in ranks: + # Note: in cf the default is squeeze=False, but + # numpy has an inverse parameter called keepdims + # which is by default False also, one must be set + # to the non-default for equivalents. So first + # cases (n1, n1) are both squeezed, (n2, n2) are + # not: + a1 = np.percentile(d, rank) # keepdims=False default + b1 = d.percentile(rank, squeeze=True) + self.assertTrue(b1.allclose(a1, rtol=1e-05, atol=1e-08)) + a2 = np.percentile(d, rank, keepdims=True) + b2 = d.percentile(rank) # squeeze=False default + self.assertTrue(b2.shape, a2.shape) + self.assertTrue(b2.allclose(a2, rtol=1e-05, atol=1e-08)) # TODO: add loop to check get same shape and close enough data # for every possible axes combo (as with test_Data_median above). + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_mean_of_upper_decile(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - p = numpy.percentile(b, 90, axis=-1, keepdims=True) - b = numpy.ma.where(b < p, numpy.ma.masked, b) - b = numpy.average(b, axis=-1) + for pp in (True, False): + # unweighted, unmasked + d = cf.Data(self.a) + for axes in self.axes_combinations: + b = reshape_array(self.a, axes) + p = np.percentile(b, 90, axis=-1, keepdims=True) + b = np.ma.where(b < p, np.ma.masked, b) + b = np.average(b, axis=-1) - e = d.mean_of_upper_decile( - axes=axes, squeeze=True, _preserve_partitions=pp - ) + e = d.mean_of_upper_decile( + axes=axes, squeeze=True, _preserve_partitions=pp + ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "mean_of_upper_decile, axis={}, unweighted, " - "unmasked \ne={}, \nb={}".format(axes, e.array, b), - ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "mean_of_upper_decile, axis={}, unweighted, " + "unmasked \ne={}, \nb={}".format(axes, e.array, b), + ) - # unweighted, masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - b = numpy.ma.filled(b, numpy.nan) - with numpy.testing.suppress_warnings() as sup: - sup.filter( - RuntimeWarning, - message=".*All-NaN slice encountered", - ) - p = numpy.nanpercentile( - b, 90, axis=-1, keepdims=True - ) - - b = numpy.ma.masked_where( - numpy.isnan(b), b, copy=False - ) + # unweighted, masked + d = cf.Data(self.ma) + for axes in self.axes_combinations: + b = reshape_array(self.ma, axes) + b = np.ma.filled(b, np.nan) + with np.testing.suppress_warnings() as sup: + sup.filter( + RuntimeWarning, message=".*All-NaN slice encountered" + ) + p = np.nanpercentile(b, 90, axis=-1, keepdims=True) - 
p = numpy.where(numpy.isnan(p), b.max() + 1, p) + b = np.ma.masked_where(np.isnan(b), b, copy=False) - with numpy.testing.suppress_warnings() as sup: - sup.filter( - RuntimeWarning, - message=".*invalid value encountered in less", - ) - b = numpy.ma.where(b < p, numpy.ma.masked, b) + p = np.where(np.isnan(p), b.max() + 1, p) - b = numpy.ma.average(b, axis=-1) - b = numpy.ma.asanyarray(b) + with np.testing.suppress_warnings() as sup: + sup.filter( + RuntimeWarning, + message=".*invalid value encountered in less", + ) + b = np.ma.where(b < p, np.ma.masked, b) - e = d.mean_of_upper_decile( - axes=axes, squeeze=True, _preserve_partitions=pp - ) + b = np.ma.average(b, axis=-1) + b = np.ma.asanyarray(b) - self.assertTrue( - (e.mask.array == b.mask).all(), - "mean_of_upper_decile, axis={}, \ne.mask={}, " - "\nb.mask={}".format(axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "mean_of_upper_decile, axis={}, " - "unweighted, masked " - "\ne={}, \nb={}".format(axes, e.array, b), - ) + e = d.mean_of_upper_decile( + axes=axes, squeeze=True, _preserve_partitions=pp + ) + + self.assertTrue( + (e.mask.array == b.mask).all(), + "mean_of_upper_decile, axis={}, \ne.mask={}, " + "\nb.mask={}".format(axes, e.mask.array, b.mask), + ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "mean_of_upper_decile, axis={}, " + "unweighted, masked " + "\ne={}, \nb={}".format(axes, e.array, b), + ) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_range_mid_range(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for h in ("range", "mid_range"): - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - mn = numpy.amin(b, axis=-1) - mx = numpy.amax(b, axis=-1) - if h == "range": - b = mx - mn - elif h == "mid_range": - b = 
(mx + mn) * 0.5 - - e = getattr(d, h)( - axes=axes, - squeeze=True, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma) - for h in ("range", "mid_range"): - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - mn = numpy.amin(b, axis=-1) - mx = numpy.amax(b, axis=-1) - if h == "range": - b = mx - mn - elif h == "mid_range": - b = (mx + mn) * 0.5 - - b = numpy.ma.asanyarray(b) - - e = getattr(d, h)( - axes=axes, - squeeze=True, - _preserve_partitions=pp, - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, " - "\nb.mask={}".format( - h, axes, e.mask.array, b.mask - ), - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) + for pp in (True, False): + # unweighted, unmasked + d = cf.Data(self.a) + for h in ("range", "mid_range"): + for axes in self.axes_combinations: + b = reshape_array(self.a, axes) + mn = np.amin(b, axis=-1) + mx = np.amax(b, axis=-1) + if h == "range": + b = mx - mn + elif h == "mid_range": + b = (mx + mn) * 0.5 + + e = getattr(d, h)( + axes=axes, squeeze=True, _preserve_partitions=pp + ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{}, axis={}, unweighted, unmasked " + "\ne={}, \nb={}".format(h, axes, e.array, b), + ) + # --- End: for + # unweighted, masked + d = cf.Data(self.ma) + for h in ("range", "mid_range"): + for axes in self.axes_combinations: + b = reshape_array(self.ma, axes) + mn = np.amin(b, axis=-1) + mx = np.amax(b, axis=-1) + if h == "range": + b = mx - mn + elif h == "mid_range": + b = (mx + mn) * 0.5 + + b = np.ma.asanyarray(b) + + e = getattr(d, h)( + axes=axes, squeeze=True, _preserve_partitions=pp + ) + + self.assertTrue( + (e.mask.array == b.mask).all(), + "{}, 
axis={}, \ne.mask={}, " + "\nb.mask={}".format(h, axes, e.mask.array, b.mask), + ) + + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{}, axis={}, unweighted, masked " + "\ne={}, \nb={}".format(h, axes, e.array, b), + ) + + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute 'w' for DataTest") def test_Data_integral(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - for pp in (True, False): - # unmasked - d = cf.Data(self.a) - x = cf.Data(self.w) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - v = reshape_array(self.w, axes) - b = numpy.sum(b * v, axis=-1) + for pp in (True, False): + # unmasked + d = cf.Data(self.a) + x = cf.Data(self.w) + for axes in self.axes_combinations: + b = reshape_array(self.a, axes) + v = reshape_array(self.w, axes) + b = np.sum(b * v, axis=-1) - e = d.integral( - axes=axes, - squeeze=True, - weights=x, - _preserve_partitions=pp, - ) + e = d.integral( + axes=axes, squeeze=True, weights=x, _preserve_partitions=pp + ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, unmasked \ne={}, \nb={}".format( - axes, e.array, b - ), - ) - # --- End: for + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "axis={}, unmasked \ne={}, \nb={}".format( + axes, e.array, b + ), + ) + # --- End: for - # masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - v = reshape_array(self.w, axes) - b = numpy.sum(b * v, axis=-1) - b = numpy.ma.asanyarray(b) + # masked + d = cf.Data(self.ma) + for axes in self.axes_combinations: + b = reshape_array(self.ma, axes) + v = reshape_array(self.w, axes) + b = np.sum(b * v, axis=-1) + b = np.ma.asanyarray(b) - e = d.integral( - axes=axes, - squeeze=True, - weights=x, - _preserve_partitions=pp, - ) + e = d.integral( + axes=axes, squeeze=True, weights=x, _preserve_partitions=pp + ) - self.assertTrue( - 
(e.mask.array == b.mask).all(), - "axis={} masked, \ne.mask={}, " - "\nb.mask={}".format(axes, e.mask.array, b.mask), - ) + self.assertTrue( + (e.mask.array == b.mask).all(), + "axis={} masked, \ne.mask={}, " + "\nb.mask={}".format(axes, e.mask.array, b.mask), + ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, masked \ne={}, \nb={}".format( - axes, e.array, b - ), - ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "axis={}, masked \ne={}, \nb={}".format(axes, e.array, b), + ) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_sum_of_weights_sum_of_weights2(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for h in ("sum_of_weights", "sum_of_weights2"): - for axes in self.axes_combinations: - b = reshape_array(self.ones, axes) - b = b.sum(axis=-1) - e = getattr(d, h)( - axes=axes, - squeeze=True, - _preserve_partitions=pp, - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked, pp={}, " - "\ne={}, \nb={}".format( - h, axes, pp, e.array, b - ), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma) - for a, h in zip( - (self.mones, self.mones), - ("sum_of_weights", "sum_of_weights2"), - ): - for axes in self.axes_combinations: - b = reshape_array(a, axes) - b = numpy.ma.asanyarray(b.sum(axis=-1)) - e = getattr(d, h)( - axes=axes, - squeeze=True, - _preserve_partitions=pp, - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, unweighted, masked, pp={}, " - "\ne.mask={}, \nb.mask={}".format( - h, axes, pp, e.mask.array, b.mask - ), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked, pp={}, " - "\ne={}, \nb={}".format( - h, axes, pp, e.array, b - ), - ) - # --- End: for - - # weighted, masked - d = 
cf.Data(self.ma) - x = cf.Data(self.w) - for a, h in zip( - (self.mw, self.mw * self.mw), - ("sum_of_weights", "sum_of_weights2"), - ): - for axes in self.axes_combinations: - a = a.copy() - a.mask = self.ma.mask - b = reshape_array(a, axes) - b = numpy.ma.asanyarray(b.sum(axis=-1)) - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - _preserve_partitions=pp, - ) - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, " - "\nb.mask={}".format( - h, - axes, - e.mask.array, - b.mask, - ), - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, \ne={}, \nb={}".format( - h, axes, e.array, b - ), - ) - # --- End: for - - # weighted, unmasked - d = cf.Data(self.a) - for a, h in zip( - (self.w, self.w * self.w), - ("sum_of_weights", "sum_of_weights2"), - ): - for axes in self.axes_combinations: - b = reshape_array(a, axes) - b = b.sum(axis=-1) - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, \ne={}, \nb={}".format( - h, axes, e.array, b - ), - ) - - def test_Data_mean_mean_absolute_value(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for absolute in (False, True): - a = self.a - ma = self.ma - method = "mean" - if absolute: - a = numpy.absolute(a) - ma = numpy.absolute(ma) - method = "mean_absolute_value" - - for chunksize in self.chunk_sizes: - cf.chunksize(chunksize) - - # unweighted, unmasked - d = cf.Data(self.a) + for pp in (True, False): + # unweighted, unmasked + d = cf.Data(self.a) + for h in ("sum_of_weights", "sum_of_weights2"): for axes in self.axes_combinations: - b = reshape_array(a, axes) - b = numpy.mean(b, axis=-1) - e = getattr(d, method)(axes=axes, squeeze=True) + b = reshape_array(self.ones, axes) + b = b.sum(axis=-1) + e = getattr(d, h)( + axes=axes, squeeze=True, _preserve_partitions=pp + ) self.assertTrue( e.allclose(b, 
rtol=1e-05, atol=1e-08), - "{} axis={}, unweighted, unmasked \ne={}, " - "\nb={}".format(method, axes, e.array, b), + "{}, axis={}, unweighted, unmasked, pp={}, " + "\ne={}, \nb={}".format(h, axes, pp, e.array, b), ) - # --- End: for + # --- End: for - # weighted, unmasked - x = cf.Data(self.w) + # unweighted, masked + d = cf.Data(self.ma) + for a, h in zip( + (self.mones, self.mones), ("sum_of_weights", "sum_of_weights2") + ): for axes in self.axes_combinations: b = reshape_array(a, axes) - v = reshape_array(self.w, axes) - b = numpy.average(b, axis=-1, weights=v) - - e = getattr(d, method)(axes=axes, weights=x, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{} weighted, unmasked axis={}, \ne={}, " - "\nb={}".format(method, axes, e.array, b), + b = np.ma.asanyarray(b.sum(axis=-1)) + e = getattr(d, h)( + axes=axes, squeeze=True, _preserve_partitions=pp ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(ma, axes) - b = numpy.ma.average(b, axis=-1) - b = numpy.ma.asanyarray(b) - - e = getattr(d, method)(axes=axes, squeeze=True) self.assertTrue( (e.mask.array == b.mask).all(), - "{} unweighted, masked axis={}, \ne.mask={}, " - "\nb.mask={}".format( - method, axes, e.mask.array, b.mask + "{}, axis={}, unweighted, masked, pp={}, " + "\ne.mask={}, \nb.mask={}".format( + h, axes, pp, e.mask.array, b.mask ), ) self.assertTrue( e.allclose(b, rtol=1e-05, atol=1e-08), - "{} unweighted, masked axis={}, \ne={}, " - "\nb={}, ".format(method, axes, e.array, b), + "{}, axis={}, unweighted, masked, pp={}, " + "\ne={}, \nb={}".format(h, axes, pp, e.array, b), ) - # --- End: for + # --- End: for - # weighted, masked + # weighted, masked + d = cf.Data(self.ma) + x = cf.Data(self.w) + for a, h in zip( + (self.mw, self.mw * self.mw), + ("sum_of_weights", "sum_of_weights2"), + ): for axes in self.axes_combinations: - b = reshape_array(ma, axes) - v = reshape_array(self.mw, axes) - b 
= numpy.ma.average(b, axis=-1, weights=v) - b = numpy.ma.asanyarray(b) - - e = getattr(d, method)(axes=axes, weights=x, squeeze=True) - + a = a.copy() + a.mask = self.ma.mask + b = reshape_array(a, axes) + b = np.ma.asanyarray(b.sum(axis=-1)) + e = getattr(d, h)( + axes=axes, + weights=x, + squeeze=True, + _preserve_partitions=pp, + ) self.assertTrue( (e.mask.array == b.mask).all(), - "{} weighted, masked axis={}, \ne.mask={}, " - "\nb.mask={}".format( - method, axes, e.mask.array, b.mask + "{}, axis={}, \ne.mask={}, " + "\nb.mask={}".format(h, axes, e.mask.array, b.mask), + ) + + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{}, axis={}, \ne={}, \nb={}".format( + h, axes, e.array, b ), ) + # --- End: for + # weighted, unmasked + d = cf.Data(self.a) + for a, h in zip( + (self.w, self.w * self.w), + ("sum_of_weights", "sum_of_weights2"), + ): + for axes in self.axes_combinations: + b = reshape_array(a, axes) + b = b.sum(axis=-1) + e = getattr(d, h)( + axes=axes, + weights=x, + squeeze=True, + _preserve_partitions=pp, + ) self.assertTrue( e.allclose(b, rtol=1e-05, atol=1e-08), - "{} weighted, masked axis={}, \ne={}, " - "\nb={}, ".format(method, axes, e.array, b), + "{}, axis={}, \ne={}, \nb={}".format( + h, axes, e.array, b + ), ) - # --- End: for - cf.chunksize(self.original_chunksize) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") + def test_Data_mean_mean_absolute_value(self): + if self.test_only and inspect.stack()[0][3] not in self.test_only: + return + for absolute in (False, True): + a = self.a + ma = self.ma + method = "mean" + if absolute: + a = np.absolute(a) + ma = np.absolute(ma) + method = "mean_absolute_value" + + # unweighted, unmasked + d = cf.Data(self.a) + for axes in self.axes_combinations: + b = reshape_array(a, axes) + b = np.mean(b, axis=-1) + e = getattr(d, method)(axes=axes, squeeze=True) + + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{} axis={}, unweighted, unmasked \ne={}, " + 
"\nb={}".format(method, axes, e.array, b), + ) + # --- End: for + + # weighted, unmasked + x = cf.Data(self.w) + for axes in self.axes_combinations: + b = reshape_array(a, axes) + v = reshape_array(self.w, axes) + b = np.average(b, axis=-1, weights=v) + + e = getattr(d, method)(axes=axes, weights=x, squeeze=True) + + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{} weighted, unmasked axis={}, \ne={}, " + "\nb={}".format(method, axes, e.array, b), + ) + # --- End: for + + # unweighted, masked + d = cf.Data(self.ma) + for axes in self.axes_combinations: + b = reshape_array(ma, axes) + b = np.ma.average(b, axis=-1) + b = np.ma.asanyarray(b) + + e = getattr(d, method)(axes=axes, squeeze=True) + + self.assertTrue( + (e.mask.array == b.mask).all(), + "{} unweighted, masked axis={}, \ne.mask={}, " + "\nb.mask={}".format(method, axes, e.mask.array, b.mask), + ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{} unweighted, masked axis={}, \ne={}, " + "\nb={}, ".format(method, axes, e.array, b), + ) + # --- End: for + + # weighted, masked + for axes in self.axes_combinations: + b = reshape_array(ma, axes) + v = reshape_array(self.mw, axes) + b = np.ma.average(b, axis=-1, weights=v) + b = np.ma.asanyarray(b) + + e = getattr(d, method)(axes=axes, weights=x, squeeze=True) + + self.assertTrue( + (e.mask.array == b.mask).all(), + "{} weighted, masked axis={}, \ne.mask={}, " + "\nb.mask={}".format(method, axes, e.mask.array, b.mask), + ) + + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{} weighted, masked axis={}, \ne={}, " + "\nb={}, ".format(method, axes, e.array, b), + ) + # --- End: for + + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_root_mean_square(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - # unweighted, unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = 
reshape_array(self.a, axes) ** 2 - b = numpy.mean(b, axis=-1) ** 0.5 - e = d.root_mean_square(axes=axes, squeeze=True) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, unweighted, unmasked \ne={}, " - "\nb={}".format(axes, e.array, b), - ) - # --- End: for + # unweighted, unmasked + d = cf.Data(self.a) + for axes in self.axes_combinations: + b = reshape_array(self.a, axes) ** 2 + b = np.mean(b, axis=-1) ** 0.5 + e = d.root_mean_square(axes=axes, squeeze=True) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "axis={}, unweighted, unmasked \ne={}, " + "\nb={}".format(axes, e.array, b), + ) + # --- End: for - # weighted, unmasked - x = cf.Data(self.w) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) ** 2 - v = reshape_array(self.w, axes) - b = numpy.average(b, axis=-1, weights=v) ** 0.5 + # weighted, unmasked + x = cf.Data(self.w) + for axes in self.axes_combinations: + b = reshape_array(self.a, axes) ** 2 + v = reshape_array(self.w, axes) + b = np.average(b, axis=-1, weights=v) ** 0.5 - e = d.root_mean_square(axes=axes, weights=x, squeeze=True) + e = d.root_mean_square(axes=axes, weights=x, squeeze=True) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, weighted, unmasked \ne={}, " - "\nb={}".format(axes, e.array, b), - ) - # --- End: for + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "axis={}, weighted, unmasked \ne={}, " + "\nb={}".format(axes, e.array, b), + ) + # --- End: for - # unweighted, masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) ** 2 - b = numpy.ma.average(b, axis=-1) - b = numpy.ma.asanyarray(b) ** 0.5 + # unweighted, masked + d = cf.Data(self.ma) + for axes in self.axes_combinations: + b = reshape_array(self.ma, axes) ** 2 + b = np.ma.average(b, axis=-1) + b = np.ma.asanyarray(b) ** 0.5 - e = d.root_mean_square(axes=axes, squeeze=True) + e = d.root_mean_square(axes=axes, squeeze=True) - self.assertTrue( 
- (e.mask.array == b.mask).all(), - "axis={}, unweighted, masked \ne.mask={}, " - "\nb.mask={}, ".format(axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, unweighted, masked \ne={}, " - "\nb={}, ".format(axes, e.array, b), - ) - # --- End: for + self.assertTrue( + (e.mask.array == b.mask).all(), + "axis={}, unweighted, masked \ne.mask={}, " + "\nb.mask={}, ".format(axes, e.mask.array, b.mask), + ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "axis={}, unweighted, masked \ne={}, " + "\nb={}, ".format(axes, e.array, b), + ) + # --- End: for - # weighted, masked - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) ** 2 - v = reshape_array(self.mw, axes) - b = numpy.ma.average(b, axis=-1, weights=v) - b = numpy.ma.asanyarray(b) ** 0.5 + # weighted, masked + for axes in self.axes_combinations: + b = reshape_array(self.ma, axes) ** 2 + v = reshape_array(self.mw, axes) + b = np.ma.average(b, axis=-1, weights=v) + b = np.ma.asanyarray(b) ** 0.5 - e = d.root_mean_square(axes=axes, weights=x, squeeze=True) + e = d.root_mean_square(axes=axes, weights=x, squeeze=True) - self.assertTrue( - (e.mask.array == b.mask).all(), - "axis={}, weighted, masked \ne.mask={}, " - "\nb.mask={}, ".format(axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, weighted, masked \ne={}, \nb={}, ".format( - axes, e.array, b - ), - ) + self.assertTrue( + (e.mask.array == b.mask).all(), + "axis={}, weighted, masked \ne.mask={}, " + "\nb.mask={}, ".format(axes, e.mask.array, b.mask), + ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "axis={}, weighted, masked \ne={}, \nb={}, ".format( + axes, e.array, b + ), + ) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_sample_size(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with 
cf.chunksize(chunksize): - # unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(self.ones, axes) - b = b.sum(axis=-1) - e = d.sample_size(axes=axes, squeeze=True) + # unmasked + d = cf.Data(self.a) + for axes in self.axes_combinations: + b = reshape_array(self.ones, axes) + b = b.sum(axis=-1) + e = d.sample_size(axes=axes, squeeze=True) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, \ne={}, \nb={}".format(axes, e.array, b), - ) - # --- End: for + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "axis={}, \ne={}, \nb={}".format(axes, e.array, b), + ) + # --- End: for - # masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.mones, axes) - b = b.sum(axis=-1) - e = d.sample_size(axes=axes, squeeze=True) + # masked + d = cf.Data(self.ma) + for axes in self.axes_combinations: + b = reshape_array(self.mones, axes) + b = b.sum(axis=-1) + e = d.sample_size(axes=axes, squeeze=True) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, \ne={}, \nb={}".format(axes, e.array, b), - ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "axis={}, \ne={}, \nb={}".format(axes, e.array, b), + ) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'axes_combinations'") def test_Data_sd_var(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return ddofs = (0, 1) - for chunksize in self.chunk_sizes: - cf.chunksize(chunksize) - for pp in (False, True): - # unweighted, unmasked - d = cf.Data(self.a, units="K") - for np, h in zip((numpy.var, numpy.std), ("var", "sd")): - for ddof in ddofs: - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - b = np(b, axis=-1, ddof=ddof) - e = getattr(d, h)( - axes=axes, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked pp={}, " - "\ne={}, \nb={}".format( - h, axes, pp, e.array, b - ), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma, units="K") - for np, h in zip((numpy.ma.var, numpy.ma.std), ("var", "sd")): - for ddof in ddofs: - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - b = np(b, axis=-1, ddof=ddof) - e = getattr(d, h)( - axes=axes, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked, pp={}, " - "\ne={}, \nb={}".format( - h, axes, pp, e.array, b - ), - ) - # --- End: for - - # weighted, unmasked - d = cf.Data(self.a, units="K") - x = cf.Data(self.w) - for h in ("var", "sd"): + for pp in (False, True): + # unweighted, unmasked + d = cf.Data(self.a, units="K") + for _np, h in zip((np.var, np.std), ("var", "sd")): + for ddof in ddofs: for axes in self.axes_combinations: - for ddof in (0, 1): - b = reshape_array(self.a, axes) - v = reshape_array(self.w, axes) - - avg = numpy.average(b, axis=-1, weights=v) - if numpy.ndim(avg) < b.ndim: - avg = numpy.expand_dims(avg, -1) - - b, V1 = numpy.average( - (b - avg) ** 2, - axis=-1, - weights=v, - returned=True, - ) - - if ddof == 1: - # Calculate the weighted unbiased - # variance. 
The unbiased variance - # weighted with _reliability_ weights - # is [V1**2/(V1**2-V2)]*var. - V2 = numpy.asanyarray((v * v).sum(axis=-1)) - b *= V1 * V1 / (V1 * V1 - V2) - elif ddof == 0: - pass - - if h == "sd": - b **= 0.5 - - b = numpy.ma.asanyarray(b) - - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, weighted, unmasked, pp={}, " - "ddof={}, \ne={}, \nb={}".format( - h, axes, pp, ddof, e.array, b - ), - ) - # --- End: for - - # weighted, masked - d = cf.Data(self.ma, units="K") - x = cf.Data(self.w) - for h in ("var", "sd"): + b = reshape_array(self.a, axes) + b = _np(b, axis=-1, ddof=ddof) + e = getattr(d, h)( + axes=axes, + squeeze=True, + ddof=ddof, + _preserve_partitions=pp, + ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{}, axis={}, unweighted, unmasked pp={}, " + "\ne={}, \nb={}".format(h, axes, pp, e.array, b), + ) + # --- End: for + + # unweighted, masked + d = cf.Data(self.ma, units="K") + for _np, h in zip((np.ma.var, np.ma.std), ("var", "sd")): + for ddof in ddofs: for axes in self.axes_combinations: - for ddof in (0, 1): - b = reshape_array(self.ma, axes) - v = reshape_array(self.mw, axes) - - not_enough_data = ( - numpy.ma.count(b, axis=-1) <= ddof - ) - - avg = numpy.ma.average(b, axis=-1, weights=v) - if numpy.ndim(avg) < b.ndim: - avg = numpy.expand_dims(avg, -1) - - b, V1 = numpy.ma.average( - (b - avg) ** 2, - axis=-1, - weights=v, - returned=True, - ) - - b = numpy.ma.where( - not_enough_data, numpy.ma.masked, b - ) - - if ddof == 1: - # Calculate the weighted unbiased - # variance. The unbiased variance - # weighted with _reliability_ weights - # is [V1**2/(V1**2-V2)]*var. 
- V2 = numpy.asanyarray((v * v).sum(axis=-1)) - b *= V1 * V1 / (V1 * V1 - V2) - elif ddof == 0: - pass - - if h == "sd": - b **= 0.5 - - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - - if h == "sd": - self.assertEqual(e.Units, d.Units) - else: - self.assertEqual(e.Units, d.Units ** 2) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, " - "\nb.mask={}, ".format( - h, axes, e.mask.array, b.mask - ), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, weighted, masked, pp={}, " - "ddof={}, \ne={}, \nb={}".format( - h, axes, pp, ddof, e.array, b - ), - ) - # --- End: for + b = reshape_array(self.ma, axes) + b = _np(b, axis=-1, ddof=ddof) + e = getattr(d, h)( + axes=axes, + squeeze=True, + ddof=ddof, + _preserve_partitions=pp, + ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{}, axis={}, unweighted, masked, pp={}, " + "\ne={}, \nb={}".format(h, axes, pp, e.array, b), + ) + # --- End: for - cf.chunksize(self.original_chunksize) + # weighted, unmasked + d = cf.Data(self.a, units="K") + x = cf.Data(self.w) + for h in ("var", "sd"): + for axes in self.axes_combinations: + for ddof in (0, 1): + b = reshape_array(self.a, axes) + v = reshape_array(self.w, axes) + + avg = np.average(b, axis=-1, weights=v) + if np.ndim(avg) < b.ndim: + avg = np.expand_dims(avg, -1) + + b, V1 = np.average( + (b - avg) ** 2, axis=-1, weights=v, returned=True + ) + if ddof == 1: + # Calculate the weighted unbiased + # variance. The unbiased variance + # weighted with _reliability_ weights + # is [V1**2/(V1**2-V2)]*var. 
+ V2 = np.asanyarray((v * v).sum(axis=-1)) + b *= V1 * V1 / (V1 * V1 - V2) + elif ddof == 0: + pass + + if h == "sd": + b **= 0.5 + + b = np.ma.asanyarray(b) + + e = getattr(d, h)( + axes=axes, + weights=x, + squeeze=True, + ddof=ddof, + _preserve_partitions=pp, + ) + + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{}, axis={}, weighted, unmasked, pp={}, " + "ddof={}, \ne={}, \nb={}".format( + h, axes, pp, ddof, e.array, b + ), + ) + # --- End: for + + # weighted, masked + d = cf.Data(self.ma, units="K") + x = cf.Data(self.w) + for h in ("var", "sd"): + for axes in self.axes_combinations: + for ddof in (0, 1): + b = reshape_array(self.ma, axes) + v = reshape_array(self.mw, axes) + + not_enough_data = np.ma.count(b, axis=-1) <= ddof + + avg = np.ma.average(b, axis=-1, weights=v) + if np.ndim(avg) < b.ndim: + avg = np.expand_dims(avg, -1) + + b, V1 = np.ma.average( + (b - avg) ** 2, axis=-1, weights=v, returned=True + ) + + b = np.ma.where(not_enough_data, np.ma.masked, b) + + if ddof == 1: + # Calculate the weighted unbiased + # variance. The unbiased variance + # weighted with _reliability_ weights + # is [V1**2/(V1**2-V2)]*var. 
+ V2 = np.asanyarray((v * v).sum(axis=-1)) + b *= V1 * V1 / (V1 * V1 - V2) + elif ddof == 0: + pass + + if h == "sd": + b **= 0.5 + + e = getattr(d, h)( + axes=axes, + weights=x, + squeeze=True, + ddof=ddof, + _preserve_partitions=pp, + ) + + if h == "sd": + self.assertEqual(e.Units, d.Units) + else: + self.assertEqual(e.Units, d.Units ** 2) + + self.assertTrue( + (e.mask.array == b.mask).all(), + "{}, axis={}, \ne.mask={}, " + "\nb.mask={}, ".format( + h, axes, e.mask.array, b.mask + ), + ) + self.assertTrue( + e.allclose(b, rtol=1e-05, atol=1e-08), + "{}, axis={}, weighted, masked, pp={}, " + "ddof={}, \ne={}, \nb={}".format( + h, axes, pp, ddof, e.array, b + ), + ) + # --- End: for + + @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits unexpected kwarg 'select'") def test_Data_dumpd_loadd_dumps(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.read(self.filename)[0].data + d = cf.read(self.filename)[0].data - dumpd = d.dumpd() - self.assertTrue(d.equals(cf.Data(loadd=dumpd), verbose=2)) - self.assertTrue(d.equals(cf.Data(loadd=dumpd), verbose=2)) + dumpd = d.dumpd() + self.assertTrue(d.equals(cf.Data(loadd=dumpd), verbose=2)) + self.assertTrue(d.equals(cf.Data(loadd=dumpd), verbose=2)) - d.to_disk() - self.assertTrue(d.equals(cf.Data(loadd=dumpd), verbose=2)) + d.to_disk() + self.assertTrue(d.equals(cf.Data(loadd=dumpd), verbose=2)) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits unexpected kwarg 'select'") def test_Data_section(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in (300, 10000, 100000)[::-1]: - with cf.chunksize(chunksize): - f = cf.read(self.filename6)[0] - self.assertEqual( - list(sorted(f.data.section((1, 2)).keys())), - [(x, None, None) for x in range(1800)], - ) - d = cf.Data(numpy.arange(120).reshape(2, 3, 4, 5)) - x = d.section([1, 3]) - self.assertEqual(len(x), 8) - e = 
cf.Data.reconstruct_sectioned_data(x) - self.assertTrue(e.equals(d)) + f = cf.read(self.filename6)[0] + self.assertEqual( + list(sorted(f.data.section((1, 2)).keys())), + [(x, None, None) for x in range(1800)], + ) + d = cf.Data(np.arange(120).reshape(2, 3, 4, 5)) + x = d.section([1, 3]) + self.assertEqual(len(x), 8) + e = cf.Data.reconstruct_sectioned_data(x) + self.assertTrue(e.equals(d)) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_count(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return - for chunksize in (100000, 10000, 300): - with cf.chunksize(chunksize): - d = cf.Data(ma) - self.assertEqual(d.count(), 284, d.count()) - self.assertEqual( - d.count_masked(), d.size - 284, d.count_masked() - ) + d = cf.Data(ma) + self.assertEqual(d.count(), 284, d.count()) + self.assertEqual(d.count_masked(), d.size - 284, d.count_masked()) - d = cf.Data(a) - self.assertEqual(d.count(), d.size) - self.assertEqual(d.count_masked(), 0) + d = cf.Data(a) + self.assertEqual(d.count(), d.size) + self.assertEqual(d.count_masked(), 0) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_exp(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return for x in (1, -1): a = 0.9 * x * self.ma - c = numpy.ma.exp(a) - - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - d = cf.Data(a) - e = d.exp() - self.assertIsNone(d.exp(inplace=True)) - self.assertTrue(d.equals(e, verbose=2)) - self.assertEqual(d.shape, c.shape) - # The CI at one point gave a failure due to - # precision with: - # self.assertTrue((d.array==c).all()) so need a - # check which accounts for floating point calcs: - numpy.testing.assert_allclose(d.array, c) + c = np.ma.exp(a) + + d = cf.Data(a) + e = d.exp() + self.assertIsNone(d.exp(inplace=True)) + self.assertTrue(d.equals(e, verbose=2)) + self.assertEqual(d.shape, c.shape) + # The CI at one point gave a failure due to + # precision with: + # self.assertTrue((d.array==c).all()) so need a + # check which accounts for floating point calcs: + np.testing.assert_allclose(d.array, c) # --- End: for d = cf.Data(a, "m") with self.assertRaises(Exception): _ = d.exp() + @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 
'partition_configuration'") def test_Data_trigonometric_hyperbolic(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -3103,47 +3000,37 @@ def test_Data_trigonometric_hyperbolic(self): # apply some trig operation to convert it to valid range: if method.startswith("arc"): if method == "arccosh": # has unusual domain (x >= 1) - a = numpy.cosh(a.data) # convert non-masked x to >= 1 + a = np.cosh(a.data) # convert non-masked x to >= 1 else: # convert non-masked values x to range |x| < 1 - a = numpy.sin(a.data) - - c = getattr(numpy.ma, method)(a) - for chunksize in self.chunk_sizes: - with cf.chunksize(chunksize): - for units in (None, "", "1", "radians", "K"): - d = cf.Data(a, units=units) - # Suppress warnings that some values are - # invalid (NaN, +/- inf) or there is - # attempted division by zero, as this is - # expected with inverse trig: - with numpy.errstate( - invalid="ignore", divide="ignore" - ): - e = getattr(d, method)() - self.assertIsNone( - getattr(d, method)(inplace=True) - ) - - self.assertTrue( - d.equals(e, verbose=2), "{}".format(method) - ) - self.assertEqual(d.shape, c.shape) - self.assertTrue( - (d.array == c).all(), - "{}, {}, {}, {}".format( - method, units, d.array, c - ), - ) - self.assertTrue( - (d.mask.array == c.mask).all(), - "{}, {}, {}, {}".format( - method, units, d.array, c - ), - ) + a = np.sin(a.data) + + c = getattr(np.ma, method)(a) + for units in (None, "", "1", "radians", "K"): + d = cf.Data(a, units=units) + # Suppress warnings that some values are + # invalid (NaN, +/- inf) or there is + # attempted division by zero, as this is + # expected with inverse trig: + with np.errstate(invalid="ignore", divide="ignore"): + e = getattr(d, method)() + self.assertIsNone(getattr(d, method)(inplace=True)) + + self.assertTrue( + d.equals(e, verbose=2), "{}".format(method) + ) + self.assertEqual(d.shape, c.shape) + self.assertTrue( + (d.array == c).all(), + "{}, {}, {}, {}".format(method, units, d.array, c), 
+ ) + self.assertTrue( + (d.mask.array == c.mask).all(), + "{}, {}, {}, {}".format(method, units, d.array, c), + ) # --- End: for # Also test masking behaviour: masking of invalid data occurs for - # numpy.ma module by default but we don't want that so there is logic + # np.ma module by default but we don't want that so there is logic # to workaround it. So check that invalid values do emerge. inverse_methods = [ method @@ -3153,7 +3040,7 @@ def test_Data_trigonometric_hyperbolic(self): d = cf.Data([2, 1.5, 1, 0.5, 0], mask=[1, 0, 0, 0, 1]) for method in inverse_methods: - with numpy.errstate(invalid="ignore", divide="ignore"): + with np.errstate(invalid="ignore", divide="ignore"): e = getattr(d, method)() self.assertTrue( (e.mask.array == d.mask.array).all(), @@ -3162,12 +3049,12 @@ def test_Data_trigonometric_hyperbolic(self): # In addition, test that 'nan', inf' and '-inf' emerge distinctly f = cf.Data([-2, -1, 1, 2], mask=[0, 0, 0, 1]) - with numpy.errstate(invalid="ignore", divide="ignore"): + with np.errstate(invalid="ignore", divide="ignore"): g = f.arctanh().array # expect [ nan, -inf, inf, --] - self.assertTrue(numpy.isnan(g[0])) - self.assertTrue(numpy.isneginf(g[1])) - self.assertTrue(numpy.isposinf(g[2])) + self.assertTrue(np.isnan(g[0])) + self.assertTrue(np.isneginf(g[1])) + self.assertTrue(np.isposinf(g[2])) self.assertIs(g[3], cf.masked) # AT2 @@ -3177,21 +3064,22 @@ def test_Data_trigonometric_hyperbolic(self): # a1 = 0.9 * x * self.ma # a2 = 0.5 * x * self.a # # Transform data for 'a' into range more appropriate for inverse: - # a1 = numpy.sin(a1.data) - # a2 = numpy.cos(a2.data) - - # c = numpy.ma.arctan2(a1, a2) - # for chunksize in self.chunk_sizes: - # cf.chunksize(chunksize) - # for units in (None, '', '1', 'radians', 'K'): - # d1 = cf.Data(a1, units=units) - # d2 = cf.Data(a2, units=units) - # e = cf.Data.arctan2(d1, d2) - # # Note: no inplace arg for arctan2 (operates on 2 arrays) - # self.assertEqual(d1.shape, c.shape) - # 
self.assertTrue((e.array == c).all()) - # self.assertTrue((d1.mask.array == c.mask).all()) - + # a1 = np.sin(a1.data) + # a2 = np.cos(a2.data) + + # c = np.ma.arctan2(a1, a2) + # for units in (None, '', '1', 'radians', 'K'): + # d1 = cf.Data(a1, units=units) + # d2 = cf.Data(a2, units=units) + # e = cf.Data.arctan2(d1, d2) + # # Note: no inplace arg for arctan2 (operates on 2 arrays) + # self.assertEqual(d1.shape, c.shape) + # self.assertTrue((e.array == c).all()) + # self.assertTrue((d1.mask.array == c.mask).all()) + + @unittest.skipIf( + TEST_DASKIFIED_ONLY, "hits 'TODODASK - use harden_mask/soften_mask'" + ) def test_Data_filled(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -3201,48 +3089,18 @@ def test_Data_filled(self): d[0, 0] = cf.masked self.assertTrue( - ( - d.filled().array - == [ - [ - -9223372036854775806, - 2, - 3, - ] - ] - ).all() + (d.filled().array == [[-9223372036854775806, 2, 3]]).all() ) d.set_fill_value(-99) - self.assertTrue( - ( - d.filled().array - == [ - [ - -99, - 2, - 3, - ] - ] - ).all() - ) + self.assertTrue((d.filled().array == [[-99, 2, 3]]).all()) - self.assertTrue( - ( - d.filled(1e10).array - == [ - [ - 1e10, - 2, - 3, - ] - ] - ).all() - ) + self.assertTrue((d.filled(1e10).array == [[1e10, 2, 3]]).all()) d = cf.Data(["a", "b", "c"], mask=[1, 0, 0]) self.assertTrue((d.filled().array == ["", "b", "c"]).all()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, "units-related problem") def test_Data_del_units(self): d = cf.Data(1) with self.assertRaises(ValueError): @@ -3280,6 +3138,7 @@ def test_Data_del_calendar(self): d = cf.Data(1, "days since 2000-1-1", calendar="noleap") self.assertTrue(d.del_calendar(), "noleap") + @unittest.skipIf(TEST_DASKIFIED_ONLY, "units-related problem") def test_Data_has_units(self): d = cf.Data(1) self.assertFalse(d.has_units()) @@ -3288,6 +3147,7 @@ def test_Data_has_units(self): d = cf.Data(1, "m") self.assertTrue(d.has_units()) + @unittest.skipIf(TEST_DASKIFIED_ONLY, 
"units-related problem") def test_Data_has_calendar(self): d = cf.Data(1) self.assertFalse(d.has_calendar()) @@ -3301,6 +3161,125 @@ def test_Data_has_calendar(self): d = cf.Data(1, "days since 2000-1-1", calendar="noleap") self.assertTrue(d.has_calendar()) + def test_Data_where(self): + a = np.arange(10) + d = cf.Data(a) + b = np.where(a < 5, a, 10 * a) + e = d.where(a < 5, d, 10 * a) + self.assertTrue(e.shape == b.shape) + self.assertTrue((e.array == b).all()) + + d = cf.Data(a, "km") + b = np.where(a < 5, 10 * a, a) + e = d.where(a < 5, cf.Data(10000 * a, "metre")) + self.assertTrue(e.shape == b.shape) + self.assertTrue((e.array == b).all()) + + a = np.array([[1, 2], [3, 4]]) + d = cf.Data(a) + b = np.where([[True, False], [True, True]], a, [[9, 8], [7, 6]]) + e = d.where([[True, False], [True, True]], d, [[9, 8], [7, 6]]) + self.assertTrue(e.shape == b.shape) + self.assertTrue((e.array == b).all()) + + b = np.where([[True, False], [True, True]], [[9, 8], [7, 6]], a) + e = d.where([[True, False], [True, True]], [[9, 8], [7, 6]]) + self.assertTrue(e.shape == b.shape) + self.assertTrue((e.array == b).all()) + + b = np.where([True, False], [9, 8], a) + e = d.where([True, False], [9, 8]) + self.assertTrue(e.shape == b.shape) + self.assertTrue((e.array == b).all()) + + a = np.array([[0, 1, 2], [0, 2, 4], [0, 3, 6]]) + d = cf.Data(a) + b = np.where(a < 4, a, -1) + e = d.where(a < 4, d, -1) + self.assertTrue(e.shape == b.shape) + self.assertTrue((e.array == b).all()) + + x, y = np.ogrid[:3, :4] + d = cf.Data(x) + with self.assertRaises(ValueError): + # Can't change shape + d.where(x < y, d, 10 + y) + + with self.assertRaises(ValueError): + # Can't change shape + d.where(False, d, 10 + y) + + a = np.ma.arange(9, dtype=int).reshape(3, 3) + d = cf.Data(a, mask=[[0, 0, 0], [1, 0, 0], [0, 0, 0]]) + e = d.where(a > 5, None, -999) + self.assertTrue(e.shape == d.shape) + self.assertTrue((e.array.mask == d.array.mask).all()) + self.assertTrue( + (e.array == [[-999, -999, -999], 
[5, -999, -999], [6, 7, 8]]).all() + ) + + d.soften_mask() + e = d.where(a > 5, None, -999) + self.assertTrue(e.shape == d.shape) + self.assertTrue((e.array.mask == False).all()) + self.assertTrue( + ( + e.array == [[-999, -999, -999], [-999, -999, -999], [6, 7, 8]] + ).all() + ) + + a = np.arange(10) + d = cf.Data(a) + e = d.where(a < 5, cf.masked) + self.assertTrue((e.array.mask == [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]).all()) + self.assertTrue((e.array == a).all()) + + def test_Data__init__compression(self): + if self.test_only and inspect.stack()[0][3] not in self.test_only: + return + + import cfdm + + # Ragged + for f in cfdm.read("DSG_timeSeries_contiguous.nc"): + f = f.data + d = cf.Data(cf.RaggedContiguousArray(source=f.source())) + self.assertTrue((d.array == f.array).all()) + # self.assertTrue(d.equals(f)) + + for f in cfdm.read("DSG_timeSeries_indexed.nc"): + f = f.data + d = cf.Data(cf.RaggedIndexedArray(source=f.source())) + self.assertTrue((d.array == f.array).all()) + # self.assertTrue(d.equals(f)) + + for f in cfdm.read("DSG_timeSeriesProfile_indexed_contiguous.nc"): + f = f.data + d = cf.Data(cf.RaggedIndexedContiguousArray(source=f.source())) + self.assertTrue((d.array == f.array).all()) + # self.assertTrue(d.equals(f)) + + # Ragged bounds + f = cfdm.read("DSG_timeSeriesProfile_indexed_contiguous.nc")[0] + f = f.construct("long_name=height above mean sea level").bounds.data + d = cf.Data(cf.RaggedIndexedContiguousArray(source=f.source())) + self.assertTrue((d.array == f.array).all()) + # self.assertTrue(d.equals(f)) + + # Gathered + for f in cfdm.read("gathered.nc"): + f = f.data + d = cf.Data(cf.GatheredArray(source=f.source())) + # self.assertTrue(d.equals(f)) + self.assertTrue((d.array == f.array).all()) + + # Subsampled + f = cfdm.read("subsampled_2.nc")[-3] + f = f.construct("longitude").data + d = cf.Data(cf.SubsampledArray(source=f.source())) + # self.assertTrue(d.equals(f)) + self.assertTrue((d.array == f.array).all()) + if __name__ == 
"__main__": print("Run date:", datetime.datetime.now()) diff --git a/cf/test/test_Datetime.py b/cf/test/test_Datetime.py index 37f93ada0d..d5b487c87a 100644 --- a/cf/test/test_Datetime.py +++ b/cf/test/test_Datetime.py @@ -54,17 +54,11 @@ def test_Datetime_rt2dt(self): b = cf.cfdatetime.rt2dt([1, 3], Units("days since 2004-2-28")) self.assertTrue((a == b).all()) - for a in ( - np.ma.array(3), - np.ma.array([3]), - ): + for a in (np.ma.array(3), np.ma.array([3])): b = cf.cfdatetime.rt2dt(a, Units("days since 1970-01-01")) self.assertEqual(b, cf.dt(1970, 1, 4, calendar="gregorian")) - for a in ( - np.ma.array(3, mask=True), - np.ma.array([3], mask=True), - ): + for a in (np.ma.array(3, mask=True), np.ma.array([3], mask=True)): b = cf.cfdatetime.rt2dt(a, Units("days since 1970-01-01")) self.assertEqual(b.mask, True) diff --git a/cf/test/test_Field.py b/cf/test/test_Field.py index 168d5e15e5..6d4769d792 100644 --- a/cf/test/test_Field.py +++ b/cf/test/test_Field.py @@ -2419,11 +2419,7 @@ def test_Field_coordinate_reference(self): def test_Field_dimension_coordinate(self): f = self.f - for identity in ( - "grid_latitude", - "X", - "dimensioncoordinate1", - ): + for identity in ("grid_latitude", "X", "dimensioncoordinate1"): if identity == "X": key, c = f.construct("grid_longitude", item=True) else: diff --git a/cf/test/test_Query.py b/cf/test/test_Query.py index 92b6671cc4..105260c5ca 100644 --- a/cf/test/test_Query.py +++ b/cf/test/test_Query.py @@ -540,6 +540,32 @@ def test_Query_evaluate(self): self.assertNotEqual(x, cf.eq(re.compile(".*RTY$"))) self.assertNotEqual(x, cf.eq(re.compile("^.*RTY$"))) + def test_Query_set_condition_units(self): + q = cf.lt(9) + q.set_condition_units("km") + self.assertEqual(q.value.Units, cf.Units("km")) + + with self.assertRaises(ValueError): + q.set_condition_units("seconds") + + q = cf.lt(9000, units="m") + q.set_condition_units("km") + self.assertEqual(q.value.Units, cf.Units("km")) + self.assertEqual(q.value.array, 9) + + q = 
cf.lt(9) + r = cf.ge(3000, units="m") + s = q & r + s.set_condition_units("km") + self.assertEqual(s._compound[0].value.Units, cf.Units("km")) + self.assertEqual(s._compound[1].value.Units, cf.Units("km")) + self.assertEqual(s._compound[0].value.array, 9) + self.assertEqual(s._compound[1].value.array, 3) + + self.assertEqual(r.value.Units, cf.Units("m")) + self.assertEqual(r.value.array, 3000) + self.assertEqual(q.value, 9) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index d16a581ce5..3bed440150 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -158,11 +158,7 @@ def test_configuration(self): cf.configuration(of_fraction=0.0) with self.assertRaises(ValueError): cf.configuration(free_memory_factor=0.0) - new_values = { - "tempdir": "", - "atol": 0.0, - "regrid_logging": False, - } + new_values = {"tempdir": "", "atol": 0.0, "regrid_logging": False} cf.configuration(**new_values) post_set = cf.configuration() for name, val in new_values.items(): # test values that should change diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py index b0e427f2cc..3f907872df 100644 --- a/cf/test/test_read_write.py +++ b/cf/test/test_read_write.py @@ -67,10 +67,7 @@ class read_writeTest(unittest.TestCase): "NETCDF3_64BIT_OFFSET", "NETCDF3_64BIT_DATA", ] - netcdf4_fmts = [ - "NETCDF4", - "NETCDF4_CLASSIC", - ] + netcdf4_fmts = ["NETCDF4", "NETCDF4_CLASSIC"] netcdf_fmts = netcdf3_fmts + netcdf4_fmts def test_write_filename(self): @@ -562,13 +559,7 @@ def test_read_write_netCDF4_compress_shuffle(self): with cf.chunksize(chunksize): f = cf.read(self.filename)[0] for fmt in ("NETCDF4", "NETCDF4_CLASSIC", "CFA4"): - cf.write( - f, - tmpfile, - fmt=fmt, - compress=1, - shuffle=True, - ) + cf.write(f, tmpfile, fmt=fmt, compress=1, shuffle=True) g = cf.read(tmpfile)[0] self.assertTrue( f.equals(g, verbose=2), @@ -759,11 +750,7 @@ def test_read_cdl_string(self): 
command_to_run = ["ncdump", self.filename, ">", tempf] if option: command_to_run.insert(1, option) - subprocess.run( - " ".join(command_to_run), - shell=True, - check=True, - ) + subprocess.run(" ".join(command_to_run), shell=True, check=True) with open(tempf, "r") as file: cdl_string_1 = file.read() diff --git a/cf/test/test_style.py b/cf/test/test_style.py index 049febad91..923b23c505 100644 --- a/cf/test/test_style.py +++ b/cf/test/test_style.py @@ -35,10 +35,7 @@ def test_pep8_compliance(self): pep8_check = pycodestyle.StyleGuide() # Directories to skip in the recursive walk of the directory: - skip_dirs = ( - "__pycache__", - "c-lib", - ) + skip_dirs = ("__pycache__", "c-lib") # These are pycodestyle errors and warnings to explicitly ignore. For # descriptions for each code see: # https://pep8.readthedocs.io/en/latest/intro.html#error-codes diff --git a/cf/umread_lib/cInterface.py b/cf/umread_lib/cInterface.py index bada61f95c..45a377a27f 100644 --- a/cf/umread_lib/cInterface.py +++ b/cf/umread_lib/cInterface.py @@ -403,10 +403,7 @@ def get_type_and_num_words(self, int_hdr): data_type = CT.c_int() num_words = CT.c_size_t() rv = self.lib.get_type_and_num_words( - word_size, - int_hdr, - CT.pointer(data_type), - CT.pointer(num_words), + word_size, int_hdr, CT.pointer(data_type), CT.pointer(num_words) ) if rv != 0: raise umfile.UMFileException(