From ddbcfdb4fbb6242e9285a17a029e8a7c6b12e518 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 20 Apr 2022 13:26:55 +0100 Subject: [PATCH 1/9] memory/disk --- cf/data/data.py | 155 +++++++--------------------------- cf/data/mixin/deprecations.py | 95 ++++++++++++++++++++- 2 files changed, 124 insertions(+), 126 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 1e8cb5c2f9..9c030b384c 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -303,7 +303,7 @@ def __init__( copy=True, dtype=None, mask=None, - persist=False, + to_memory=False, init_options=None, _use_array=True, ): @@ -424,18 +424,26 @@ def __init__( .. versionadded:: TODODASK - persist: `bool`, optional - If True then persist the underlying array into memory, - equivalent to calling `persist` on the data - immediately after initialisation. + to_memory: `bool`, optional + If True then ensure that the original data are in + memory, rather than on disk. - If the original data are on disk then reading data + If the original data are on disk, then reading data into memory during initialisation will slow down the initialisation process, but can considerably improve downstream performance by avoiding the need for independent reads for every dask chunk, each time the data are computed. + In geneal, setting *to_memory* to True is not the same + as calling the `persist` of the newly created `Data` + object, which also decompresses data compressed by + convention and computes any data type, mask and + date-time modifications. + + If the input *array* is a `dask.array.Array` object + then *to_memory* is ignored. + .. versionadded:: TODODASK init_options: `dict`, optional @@ -597,11 +605,20 @@ def __init__( "for compressed input arrays" ) + # Bring the compressed data into memory without + # decompressing it + if to_memory: + try: + array = array.to_memory() + except AttributeError: + pass + # Save the input compressed array, as this will contain # extra information, such as a count or index variable. self._set_Array(array) array = compressed_to_dask(array, chunks) + elif not is_dask_collection(array): # Turn the data into a dask array kwargs = init_options.get("from_array", {}) @@ -612,6 +629,13 @@ def __init__( "Use the 'chunks' parameter instead." ) + # Bring the data into memory + if to_memory: + try: + array = array.to_memory() + except AttributeError: + pass + array = to_dask(array, chunks, **kwargs) elif chunks != _DEFAULT_CHUNKS: @@ -655,21 +679,6 @@ def __init__( if mask is not None: self.where(mask, cf_masked, inplace=True) - # Bring the data into memory - if persist: - self.persist(inplace=True) - - # @property# - # def dask_array(s#elf): - # """TODODASK.## - # - # :Returns: - # - # `dask.array.Array`## - # - # """ - # return self.get_dask(copy=True) - @property def dask_compressed_array(self): """TODODASK. @@ -10547,99 +10556,6 @@ def override_calendar(self, calendar, inplace=False, i=False): return d - def to_disk(self): - """Store the data array on disk. - - There is no change to partition's whose sub-arrays are already on - disk. - - :Returns: - - `None` - - **Examples:** - - >>> d.to_disk() - - """ - print("TODODASK - ???") - config = self.partition_configuration(readonly=True, to_disk=True) - - for partition in self.partitions.matrix.flat: - if partition.in_memory: - partition.open(config) - partition.array - partition.close() - - def to_memory(self, regardless=False, parallelise=False): - """Store each partition's data in memory in place if the master - array is smaller than the chunk size. - - There is no change to partitions with data that are already in memory. - - :Parameters: - - regardless: `bool`, optional - If True then store all partitions' data in memory - regardless of the size of the master array. By default - only store all partitions' data in memory if the master - array is smaller than the chunk size. - - parallelise: `bool`, optional - If True than only move those partitions to memory that are - flagged for processing on this rank. - - :Returns: - - `None` - - **Examples:** - - >>> d.to_memory() - >>> d.to_memory(regardless=True) - - """ - print("TODODASK - ???") - config = self.partition_configuration(readonly=True) - fm_threshold = cf_fm_threshold() - - # If parallelise is False then all partitions are flagged for - # processing on this rank, otherwise only a subset are - self._flag_partitions_for_processing(parallelise) - - for partition in self.partitions.matrix.flat: - if partition._process_partition: - # Only move the partition to memory if it is flagged - # for processing - partition.open(config) - if ( - partition.on_disk - and partition.nbytes <= free_memory() - fm_threshold - ): - partition.array - - partition.close() - # --- End: for - - @property - def in_memory(self): - """True if the array is retained in memory. - - :Returns: - - **Examples:** - - >>> d.in_memory - - """ - print("TODODASK - ???") - for partition in self.partitions.matrix.flat: - if not partition.in_memory: - return False - # --- End: for - - return True - @daskified(_DASKIFIED_VERBOSE) def datum(self, *index): """Return an element of the data array as a standard Python @@ -11605,17 +11521,6 @@ def swapaxes(self, axis0, axis1, inplace=False, i=False): d._set_dask(dx, reset_mask_hardness=False) return d - def save_to_disk(self, itemsize=None): - """cf.Data.save_to_disk is dead. - - Use not cf.Data.fits_in_memory instead. - - """ - raise NotImplementedError( - "cf.Data.save_to_disk is dead. Use not " - "cf.Data.fits_in_memory instead." - ) - def fits_in_memory(self, itemsize): """Return True if the master array is small enough to be retained in memory. diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index 55c1ac6067..20026be866 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -81,8 +81,29 @@ def dtvarray(self): """Deprecated at version 3.0.0.""" _DEPRECATION_ERROR_ATTRIBUTE(self, "dtvarray") # pragma: no cover + @property + def in_memory(self): + """True if the array is retained in memory. + + Deprecated at version TODODASK. + + :Returns: + + **Examples** + + >>> d.in_memory + + """ + _DEPRECATION_ERROR_ATTRIBUTE( + self, + "in_memory", + version="TODODASK", + removed_at="5.0.0", + ) # pragma: no cover + def files(self): - """Deprecated at version 3.4.0, use method `get_` instead.""" + """Deprecated at version 3.4.0, use method `get_filenames` + instead.""" _DEPRECATION_ERROR_METHOD( self, "files", @@ -361,3 +382,75 @@ def partition_boundaries(self): _DEPRECATION_ERROR_METHOD( "TODODASK - consider using 'chunks' instead" ) # pragma: no cover + + def save_to_disk(self, itemsize=None): + """Deprecated.""" + _DEPRECATION_ERROR_METHOD( + self, + "save_to_disk", + removed_at="4.0.0", + ) # pragma: no cover + + def to_disk(self): + """Store the data array on disk. + + Deprecated at version TODODASK. + + There is no change to partition's whose sub-arrays are already + on disk. + + :Returns: + + `None` + + **Examples** + + >>> d.to_disk() + + """ + _DEPRECATION_ERROR_METHOD( + self, + "to_disk", + version="TODODASK", + removed_at="5.0.0", + ) # pragma: no cover + + def to_memory(self, regardless=False, parallelise=False): + """Store each partition's data in memory in place if the master + array is smaller than the chunk size. + + Deprecated at version TODODASK. Consider using `persist` + instead. + + There is no change to partitions with data that are already in + memory. + + :Parameters: + + regardless: `bool`, optional + If True then store all partitions' data in memory + regardless of the size of the master array. By default + only store all partitions' data in memory if the + master array is smaller than the chunk size. + + parallelise: `bool`, optional + If True than only move those partitions to memory that + are flagged for processing on this rank. + + :Returns: + + `None` + + **Examples** + + >>> d.to_memory() + >>> d.to_memory(regardless=True) + + """ + _DEPRECATION_ERROR_METHOD( + self, + "to_memory", + message="Consider using 'persist' instead.", + version="TODODASK", + removed_at="5.0.0", + ) # pragma: no cover From 2d3fe9c6bafb5c1ec9e8f953bb705d88336baae8 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 21 Apr 2022 09:09:46 +0100 Subject: [PATCH 2/9] fix merge conflict resolution --- cf/__init__.py | 19 +- cf/data/abstract/compressedsubarray.py | 6 +- cf/data/cachedarray.py | 2 +- cf/data/collapse.py | 2125 ++++++++++++++ cf/data/collapse_functions.py | 1379 --------- cf/data/dask_utils.py | 30 +- cf/data/data.py | 3707 +++++++++++------------- cf/data/functions.py | 2 +- cf/data/mixin/deprecations.py | 37 +- cf/data/partition.py | 54 +- cf/data/utils.py | 8 +- cf/docstring/docstring.py | 112 +- cf/formula_terms.py | 36 +- cf/test/test_Data.py | 1649 +++++------ docs/Makefile | 2 +- docs/source/field_analysis.rst | 161 +- requirements.txt | 1 + 17 files changed, 4900 insertions(+), 4430 deletions(-) create mode 100644 cf/data/collapse.py delete mode 100644 cf/data/collapse_functions.py diff --git a/cf/__init__.py b/cf/__init__.py index 8450c86f4d..99fbb6cd22 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -78,7 +78,7 @@ __date__ = "2022-01-18" __version__ = "4.0.0b0" -_requires = ("numpy", "netCDF4", "cftime", "cfunits", "cfdm", "psutil") +_requires = ("numpy", "netCDF4", "cftime", "cfunits", "cfdm", "psutil", "dask") x = ", ".join(_requires) _error0 = f"cf v{ __version__} requires the modules {x}. " @@ -104,10 +104,6 @@ _found_ESMF = bool(importlib.util.find_spec("ESMF")) -# TODODASK - Remove the next 2 lines when the move to dask is complete -mpi_on = False -mpi_size = 1 - try: import netCDF4 except ImportError as error1: @@ -133,6 +129,11 @@ except ImportError as error1: raise ImportError(_error0 + str(error1)) +try: + import dask +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + # Check the version of psutil _minimum_vn = "0.6.0" if LooseVersion(psutil.__version__) < LooseVersion(_minimum_vn): @@ -183,6 +184,14 @@ f"Got {_cfdm_version} at {cfdm.__file__}" ) +# Check the version of dask +_minimum_vn = "2022.03.0" +if LooseVersion(dask.__version__) < LooseVersion(_minimum_vn): + raise RuntimeError( + f"Bad dask version: cf requires dask>={_minimum_vn}. " + f"Got {dask.__version__} at {dask.__file__}" + ) + from .constructs import Constructs from .mixin import Coordinate diff --git a/cf/data/abstract/compressedsubarray.py b/cf/data/abstract/compressedsubarray.py index 6e49cdacc5..9ffbd8f5c0 100644 --- a/cf/data/abstract/compressedsubarray.py +++ b/cf/data/abstract/compressedsubarray.py @@ -78,7 +78,7 @@ def file(self): """The file on disk which contains the compressed array, or `None` of the array is in memory. - **Examples:** + **Examples** >>> self.file '/home/foo/bar.nc' @@ -96,7 +96,7 @@ def close(self): `None` - **Examples:** + **Examples** >>> f.close() @@ -127,7 +127,7 @@ def on_disk(self): """True if and only if the compressed array is on disk as opposed to in memory. - **Examples:** + **Examples** >>> a.on_disk() True diff --git a/cf/data/cachedarray.py b/cf/data/cachedarray.py index 770c1c658a..f3301e8744 100644 --- a/cf/data/cachedarray.py +++ b/cf/data/cachedarray.py @@ -29,7 +29,7 @@ def __init__(self, array): array: numpy array The array to be stored on disk in a temporary file. - **Examples:** + **Examples** >>> f = CachedArray(numpy.array([1, 2, 3, 4, 5])) >>> f = CachedArray(numpy.ma.array([1, 2, 3, 4, 5])) diff --git a/cf/data/collapse.py b/cf/data/collapse.py new file mode 100644 index 0000000000..5b938f99a1 --- /dev/null +++ b/cf/data/collapse.py @@ -0,0 +1,2125 @@ +"""Functions used during `Data` object collapses.""" +import inspect +from functools import partial, reduce +from operator import mul + +import numpy as np +from cfdm.core import DocstringRewriteMeta +from dask.array import chunk +from dask.array.core import _concatenate2 +from dask.array.reductions import divide, numel, reduction +from dask.core import flatten +from dask.utils import deepmap + +from ..docstring import _docstring_substitution_definitions + + +class Collapse(metaclass=DocstringRewriteMeta): + """Container for functions that collapse dask arrays. + + .. versionadded:: TODODASK + + """ + + def __docstring_substitutions__(self): + """Define docstring substitutions that apply to this class and + all of its subclasses. + + These are in addtion to, and take precendence over, docstring + substitutions defined by the base classes of this class. + + See `_docstring_substitutions` for details. + + .. versionadded:: TODODASK + + .. seealso:: `_docstring_substitutions` + + :Returns: + + `dict` + The docstring substitutions that have been applied. + + """ + return _docstring_substitution_definitions + + def __docstring_package_depth__(self): + """Return the package depth for "package" docstring + substitutions. + + See `_docstring_package_depth` for details. + + .. versionadded:: TODODASK + + """ + return 0 + + @classmethod + def max(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): + """Return maximum values of an array. + + Calculates the maximum value of an array or the maximum values + along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = a.dtype + return reduction( + a, + cf_max_chunk, + partial(cf_max_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_max_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @classmethod + def max_abs( + cls, a, axis=None, keepdims=False, mtol=None, split_every=None + ): + """Return maximum absolute values of an array. + + Calculates the maximum absolute value of an array or the + maximum absolute values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + return cls.max( + abs(a), + axis=axis, + keepdims=keepdims, + mtol=mtol, + split_every=split_every, + ) + + @classmethod + def mean( + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return mean values of an array. + + Calculates the mean value of an array or the mean values along + axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = "f8" + return reduction( + a, + cf_mean_chunk, + partial(cf_mean_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_mean_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @classmethod + def mean_abs( + cls, + a, + weights=None, + axis=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return mean absolute values of an array. + + Calculates the mean absolute value of an array or the mean + absolute values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + return cls.mean( + abs(a), + weights=weights, + axis=axis, + keepdims=keepdims, + mtol=mtol, + split_every=split_every, + ) + + @classmethod + def mid_range( + cls, + a, + axis=None, + dtype=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return mid-range values of an array. + + Calculates the mid-range value of an array or the mid-range + values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a, allowed="fi") + dtype = "f8" + return reduction( + a, + cf_range_chunk, + partial(cf_mid_range_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_range_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @classmethod + def min(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): + """Return minimum values of an array. + + Calculates the minimum value of an array or the minimum values + along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = a.dtype + return reduction( + a, + cf_min_chunk, + partial(cf_min_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_min_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @classmethod + def min_abs( + cls, a, axis=None, keepdims=False, mtol=None, split_every=None + ): + """Return minimum absolute values of an array. + + Calculates the minimum absolute value of an array or the + minimum absolute values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + return cls.min( + abs(a), + axis=axis, + keepdims=keepdims, + mtol=mtol, + split_every=split_every, + ) + + @classmethod + def range(cls, a, axis=None, keepdims=False, mtol=None, split_every=None): + """Return range values of an array. + + Calculates the range value of an array or the range values + along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a, allowed="fi") + dtype = a.dtype + return reduction( + a, + cf_range_chunk, + partial(cf_range_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_range_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @classmethod + def rms( + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return root mean square (RMS) values of an array. + + Calculates the RMS value of an array or the RMS values along + axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = "f8" + return reduction( + a, + cf_rms_chunk, + partial(cf_rms_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_mean_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @classmethod + def sample_size( + cls, a, axis=None, keepdims=False, mtol=None, split_every=None + ): + """Return sample size values of an array. + + Calculates the sample size value of an array or the sample + size values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = "i8" + return reduction( + a, + cf_sample_size_chunk, + partial(cf_sample_size_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sample_size_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + ) + + @classmethod + def sum( + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return sum values of an array. + + Calculates the sum value of an array or the sum values along + axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = double_precision_dtype(a) + if weights is not None: + dtype = np.result_type(double_precision_dtype(weights), dtype) + + return reduction( + a, + cf_sum_chunk, + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @classmethod + def sum_of_weights( + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return sum of weights values for an array. + + Calculates the sum of weights value for an array or the sum of + weights values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = double_precision_dtype(weights, default="i8") + return reduction( + a, + cf_sum_of_weights_chunk, + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @classmethod + def sum_of_weights2( + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + split_every=None, + ): + """Return sum of squares of weights values for an array. + + Calculates the sum of squares of weights value for an array or + the sum of squares of weights values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = double_precision_dtype(weights, default="i8") + return reduction( + a, + partial(cf_sum_of_weights_chunk, square=True), + partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_sum_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + @classmethod + def var( + cls, + a, + axis=None, + weights=None, + keepdims=False, + mtol=None, + ddof=None, + split_every=None, + ): + """Return variances of an array. + + Calculates the variance value of an array or the variance + values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + The array to be collapsed. + + {{Collapse weights: data_like or `None`, optional}} + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse keepdims: `bool`, optional}} + + {{mtol: number, optional}} + + {{ddof: number}} + + {{split_every: `int` or `dict`, optional}} + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + check_input_dtype(a) + dtype = "f8" + return reduction( + a, + partial(cf_var_chunk, ddof=ddof), + partial(cf_var_agg, mtol=mtol, original_shape=a.shape), + axis=axis, + keepdims=keepdims, + dtype=dtype, + split_every=split_every, + combine=cf_var_combine, + concatenate=False, + meta=np.array((), dtype=dtype), + weights=weights, + ) + + +def check_input_dtype(a, allowed="fib"): + """Check that data has a data type allowed by a collapse method. + + The collapse method is assumed to be defined by the calling + function. + + :Parameters: + + a: `dask.array.Array` + The data. + + allowed: `str`, optional + The data type kinds allowed by the collapse + method. Defaults to ``'fib'``, meaning that only float, + integer and Boolean data types are allowed. + + :Returns: + + `None` + + """ + if a.dtype.kind not in allowed: + method = inspect.currentframe().f_back.f_code.co_name + raise TypeError(f"Can't calculate {method} of data with {a.dtype!r}") + + +def double_precision_dtype(a, default=None, bool_type="i"): + """Returns the corresponding double precision data type of an array. + + :Parameters: + + a: `dask.array.Array` or `None` + The data. If `None` then the value of *default* is + returned*. + + default: `str`, optional + If *a* is `None`, then return this data type. + + bool_type: `str`, optional + The corresponding double data type kind for Boolean + data. Defaults to ``'i'``, meaning ``'i8'`` is + returned. Set to ``'f'` to return ``'f8'`` instead. + + :Returns: + + `str` + The double precision type. + + **Examples** + + >>> for d in (int, 'int32', float, 'float32', bool): + ... print(double_precision_dtype(np.array(1, dtype=d))) + ... + i8 + i8 + f8 + f8 + i8 + + >>> double_precision_dtype(np.array(1, dtype=bool), bool_type='f') + 'f8' + >>> double_precision_dtype(None, default="i8") + 'i8' + + """ + if a is None: + return default + + kind = a.dtype.kind + if kind == "b": + return bool_type + "8" + + if kind in "fi": + return kind + "8" + + raise TypeError(f"Can't collapse data with {a.dtype!r}") + + +def mask_small_sample_size(x, N, axis, mtol, original_shape): + """Mask elements where the sample size is below a threshold. + + .. versionadded:: TODODASK + + :Parameters: + + x: `numpy.ndarray` + The collapsed data. + + N: `numpy.ndarray` + The sample sizes of the collapsed values. + + axis: sequence of `int` + The axes being collapsed. + + mtol: number + The sample size threshold below which collapsed values are + set to missing data. It is defined as a fraction (between + 0 and 1 inclusive) of the contributing input data values. + + The default of *mtol* is 1, meaning that a missing datum + in the output array occurs whenever all of its + contributing input array elements are missing data. + + For other values, a missing datum in the output array + occurs whenever more than ``100*mtol%`` of its + contributing input array elements are missing data. + + Note that for non-zero values of *mtol*, different + collapsed elements may have different sample sizes, + depending on the distribution of missing data in the input + data. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + :Returns: + + `numpy.ndarray` + Array *x* masked where *N* is sufficiently small. Note + that the input *x* might be modified in-place with the + contents of the output. + + """ + if not x.ndim: + # Make sure that we have a numpy array (e.g. as opposed to a + # numpy.float64) + x = np.asanyarray(x) + + if mtol < 1: + # Nmax = total number of elements, including missing values + Nmax = reduce(mul, [original_shape[i] for i in axis], 1) + x = np.ma.masked_where(N < (1 - mtol) * Nmax, x, copy=False) + + return x + + +def sum_weights_chunk(x, weights=None, square=False, N=None, **kwargs): + """Sum the weights. + + .. versionadded:: TODODASK + + :Parameters: + + x: `numpy.ndarray` + The data. + + weights: `numpy.ndarray`, optional + The weights associated with values of the data. Must have + the same shape as *x*. By default *weights* is `None`, + meaning that all non-missing elements of the data have a + weight of 1 and all missing elements have a weight of + 0. If given as an array then those weights that are + missing data, or that correspond to the missing elements + of the data, are assigned a weight of 0. + + square: `bool`, optional + If True calculate the sum of the squares of the weights. + + N: `numpy.ndarray`, optional + The sample size. If provided as an array and there are no + weights, then the *N* is returned instead of calculating + the sum (of the squares) of weights. Ignored of *weights* + is not `None`. + + :Returns: + + `numpy.ndarray` + The sum of the weights, with data type "i8" or "f8". + + """ + if weights is None: + # All weights are 1, so the sum of the weights and the sum of + # the squares of the weights are both equal to the sample + # size. + if N is None: + N = cf_sample_size_chunk(x, **kwargs)["N"] + + return N + + dtype = double_precision_dtype(weights) + if square: + weights = np.multiply(weights, weights, dtype=dtype) + + if np.ma.is_masked(x): + weights = np.ma.masked_where(x.mask, weights) + + return chunk.sum(weights, dtype=dtype, **kwargs) + + +def combine_arrays( + pairs, key, func, axis, dtype=None, computing_meta=False, **kwargs +): + """Worker function for Combine callables. + + Select arrays by dictionary key from a nested list of + dictionaries, concatenate the resulting nested list of arrays + along the specified axes, and then apply a function to the result + along those same axes. + + See `dask.array.reductions.mean_combine` for an example. + + .. versionadded:: TODODASK + + :Returns: + + `numpy.ndarray` + + """ + x = deepmap(lambda pair: pair[key], pairs) if not computing_meta else pairs + + if dtype: + kwargs["dtype"] = dtype + + x = _concatenate2(x, axes=axis) + return func(x, axis=axis, **kwargs) + + +def sum_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): + """Alias of `combine_arrays` with ``func=chunk.sum``. + + .. versionadded:: TODODASK + + """ + return combine_arrays( + pairs, key, chunk.sum, axis, dtype, computing_meta, **kwargs + ) + + +def max_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): + """Alias of `combine_arrays` with ``func=chunk.max``. + + .. versionadded:: TODODASK + + """ + return combine_arrays( + pairs, key, chunk.max, axis, dtype, computing_meta, **kwargs + ) + + +def min_arrays(pairs, key, axis, dtype, computing_meta=False, **kwargs): + """Alias of `combine_arrays` with ``func=chunk.min``. + + .. versionadded:: TODODASK + + """ + return combine_arrays( + pairs, key, chunk.min, axis, dtype, computing_meta, **kwargs + ) + + +def sum_sample_sizes(pairs, axis, computing_meta=False, **kwargs): + """Alias of `combine_arrays` with ``key="N", func=chunk.sum, + dtype="i8"``. + + .. versionadded:: TODODASK + + """ + return combine_arrays( + pairs, + "N", + chunk.sum, + axis, + dtype="i8", + computing_meta=computing_meta, + **kwargs, + ) + + +# -------------------------------------------------------------------- +# mean +# -------------------------------------------------------------------- +def cf_mean_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): + """Chunk calculations for the mean. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * V1: The sum of ``weights`` (equal to ``N`` if weights + are not set). + * sum: The weighted sum of ``x``. + * weighted: True if weights have been set. + + """ + if computing_meta: + return x + + # N, sum + d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs) + + d["V1"] = sum_weights_chunk(x, weights, N=d["N"], **kwargs) + d["weighted"] = weights is not None + + return d + + +def cf_mean_combine( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + **kwargs, +): + """Combination calculations for the mean. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_mean_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + weighted = next(flatten(pairs))["weighted"] + d = {"weighted": weighted} + + d["sum"] = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d["sum"] + + d["N"] = sum_sample_sizes(pairs, axis, **kwargs) + if weighted: + d["V1"] = sum_arrays(pairs, "V1", axis, dtype, **kwargs) + else: + d["V1"] = d["N"] + + return d + + +def cf_mean_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the mean. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_mean_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + x = divide(d["sum"], d["V1"], dtype=dtype) + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# maximum +# -------------------------------------------------------------------- +def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Chunk calculations for the maximum. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * max: The maximum of `x``. + + """ + if computing_meta: + return x + + return { + "max": chunk.max(x, **kwargs), + "N": cf_sample_size_chunk(x, **kwargs)["N"], + } + + +def cf_max_combine( + pairs, + axis=None, + computing_meta=False, + **kwargs, +): + """Combination calculations for the maximum. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_max_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + mx = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) + if computing_meta: + return mx + + return { + "max": mx, + "N": sum_sample_sizes(pairs, axis, **kwargs), + } + + +def cf_max_agg( + pairs, + axis=None, + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the maximum. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_max_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d + + x = d["max"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# mid-range +# -------------------------------------------------------------------- +def cf_mid_range_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the mid-range. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_range_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + # Calculate the mid-range + x = divide(d["max"] + d["min"], 2.0, dtype=dtype) + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# minimum +# -------------------------------------------------------------------- +def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Chunk calculations for the minimum. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * min: The minimum of ``x``. + + """ + if computing_meta: + return x + + return { + "min": chunk.min(x, **kwargs), + "N": cf_sample_size_chunk(x, **kwargs)["N"], + } + + +def cf_min_combine( + pairs, + axis=None, + computing_meta=False, + **kwargs, +): + """Combination calculations for the minimum. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_min_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + mn = min_arrays(pairs, "min", axis, None, computing_meta, **kwargs) + if computing_meta: + return mn + + return { + "min": mn, + "N": sum_sample_sizes(pairs, axis, **kwargs), + } + + +def cf_min_agg( + pairs, + axis=None, + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the minimum. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_min_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d + + x = d["min"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# range +# -------------------------------------------------------------------- +def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): + """Chunk calculations for the range. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * min: The minimum of ``x``. + * max: The maximum of ``x`. + + """ + if computing_meta: + return x + + # N, max + d = cf_max_chunk(x, **kwargs) + + d["min"] = chunk.min(x, **kwargs) + return d + + +def cf_range_combine( + pairs, + axis=None, + dtype=None, + computing_meta=False, + **kwargs, +): + """Combination calculations for the range. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_range_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + mx = max_arrays(pairs, "max", axis, None, computing_meta, **kwargs) + if computing_meta: + return mx + + mn = min_arrays(pairs, "min", axis, None, **kwargs) + + return { + "max": mx, + "min": mn, + "N": sum_sample_sizes(pairs, axis, **kwargs), + } + + +def cf_range_agg( + pairs, + axis=None, + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the range. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_range_combine(pairs, axis, computing_meta, **kwargs) + if computing_meta: + return d + + # Calculate the range + x = d["max"] - d["min"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# root mean square +# -------------------------------------------------------------------- +def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): + """Chunk calculations for the root mean square (RMS). + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The weighted sum of ``x**2``. + + """ + if computing_meta: + return x + + return cf_mean_chunk( + np.multiply(x, x, dtype=dtype), weights=weights, dtype=dtype, **kwargs + ) + + +def cf_rms_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the root mean square (RMS). + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_mean_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + x = np.sqrt(d["sum"] / d["V1"], dtype=dtype) + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# sample size +# -------------------------------------------------------------------- +def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): + """Chunk calculations for the sample size. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + + """ + if computing_meta: + return x + + if np.ma.isMA(x): + N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs) + else: + if dtype: + kwargs["dtype"] = dtype + + N = numel(x, **kwargs) + + return {"N": N} + + +def cf_sample_size_combine( + pairs, + axis=None, + dtype="i8", + computing_meta=False, + **kwargs, +): + """Combination calculations for the sample size. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_sample_size_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + x = sum_arrays(pairs, "N", axis, dtype, computing_meta, **kwargs) + if computing_meta: + return x + + return {"N": x} + + +def cf_sample_size_agg( + pairs, + axis=None, + computing_meta=False, + dtype="i8", + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the sample size. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_sample_size_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + x = d["N"] + x = mask_small_sample_size(x, x, axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# sum +# -------------------------------------------------------------------- +def cf_sum_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): + """Chunk calculations for the sum. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The weighted sum of ``x`` + + """ + if computing_meta: + return x + + if weights is not None: + x = np.multiply(x, weights, dtype=dtype) + + d = cf_sample_size_chunk(x, **kwargs) + d["sum"] = chunk.sum(x, dtype=dtype, **kwargs) + return d + + +def cf_sum_combine( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + **kwargs, +): + """Combination calculations for the sum. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_sum_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + x = sum_arrays(pairs, "sum", axis, dtype, computing_meta, **kwargs) + if computing_meta: + return x + + return { + "sum": x, + "N": sum_sample_sizes(pairs, axis, **kwargs), + } + + +def cf_sum_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the sum. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_sum_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + x = d["sum"] + x = mask_small_sample_size(x, d["N"], axis, mtol, original_shape) + return x + + +# -------------------------------------------------------------------- +# sum of weights +# -------------------------------------------------------------------- +def cf_sum_of_weights_chunk( + x, weights=None, dtype="f8", computing_meta=False, square=False, **kwargs +): + """Chunk calculations for the sum of the weights. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + :Parameters: + + square: `bool`, optional + If True then calculate the sum of the squares of the + weights. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The sum of ``weights``, or the sum of + ``weights**2`` if *square* is True. + + """ + if computing_meta: + return x + + # N + d = cf_sample_size_chunk(x, **kwargs) + + d["sum"] = sum_weights_chunk( + x, weights=weights, square=square, N=d["N"], **kwargs + ) + + return d + + +# -------------------------------------------------------------------- +# variance +# -------------------------------------------------------------------- +def cf_var_chunk( + x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs +): + """Chunk calculations for the variance. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + See + https://en.wikipedia.org/wiki/Pooled_variance#Sample-based_statistics + for details. + + .. versionadded:: TODODASK + + :Parameters: + + ddof: number + The delta degrees of freedom. The number of degrees of + freedom used in the calculation is (N-*ddof*) where N + represents the number of non-missing elements. A value of + 1 applies Bessel's correction. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * V1: The sum of ``weights`` (equal to ``N`` if weights + are not set). + * V2: The sum of ``weights**2``, or `None` of not + required. + * sum: The weighted sum of ``x``. + * part: ``V1 * (sigma**2 + mu**2)``, where ``sigma**2`` is + the weighted biased (i.e. ``ddof=0``) variance of + ``x``, and ``mu`` is the weighted mean of ``x``. + * weighted: True if weights have been set. + * ddof: The delta degrees of freedom. + + """ + if computing_meta: + return x + + weighted = weights is not None + + # N, V1, sum + d = cf_mean_chunk(x, weights, dtype=dtype, **kwargs) + + wsum = d["sum"] + V1 = d["V1"] + + avg = divide(wsum, V1, dtype=dtype) + part = x - avg + part *= part + if weighted: + part = part * weights + + part = chunk.sum(part, dtype=dtype, **kwargs) + part = part + avg * wsum + + d["part"] = part + + if weighted and ddof == 1: + d["V2"] = sum_weights_chunk(x, weights, square=True, **kwargs) + else: + d["V2"] = None + + d["weighted"] = weighted + d["ddof"] = ddof + + return d + + +def cf_var_combine( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + **kwargs, +): + """Combination calculations for the variance. + + This function is passed to `dask.array.reduction` as its *combine* + parameter. + + .. versionadded:: TODODASK + + :Parameters: + + See `dask.array.reductions` for details of the parameters. + + :Returns: + + As for `cf_var_chunk`. + + """ + if not isinstance(pairs, list): + pairs = [pairs] + + d = next(flatten(pairs)) + weighted = d["weighted"] + ddof = d["ddof"] + d = {"weighted": weighted, "ddof": ddof} + + d["part"] = sum_arrays( + pairs, "part", axis, dtype, computing_meta, **kwargs + ) + if computing_meta: + return d["part"] + + d["sum"] = sum_arrays(pairs, "sum", axis, dtype, **kwargs) + + d["N"] = sum_sample_sizes(pairs, axis, **kwargs) + d["V1"] = d["N"] + d["V2"] = None + if weighted: + d["V1"] = sum_arrays(pairs, "V1", axis, dtype, **kwargs) + if ddof == 1: + d["V2"] = sum_arrays(pairs, "V2", axis, dtype, **kwargs) + + return d + + +def cf_var_agg( + pairs, + axis=None, + dtype="f8", + computing_meta=False, + mtol=None, + original_shape=None, + **kwargs, +): + """Aggregation calculations for the variance. + + This function is passed to `dask.array.reduction` as its + *aggregate* parameter. + + .. note:: Weights are interpreted as reliability weights, as + opposed to frequency weights. + + See + https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights + for details. + + .. versionadded:: TODODASK + + :Parameters: + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. See `mask_small_sample_size` for + details. + + original_shape: `tuple` + The shape of the original, uncollapsed data. + + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dask.array.Array` + The collapsed array. + + """ + d = cf_var_combine(pairs, axis, dtype, computing_meta, **kwargs) + if computing_meta: + return d + + ddof = d["ddof"] + V1 = d["V1"] + wsum = d["sum"] + var = d["part"] - wsum * wsum / V1 + + # Note: var is now the global value of V1 * sigma**2, where sigma + # is the global weighted biased (i.e. ddof=0) variance. + + if ddof is None: + raise ValueError(f"Must set ddof to a numeric value. Got: {ddof!r}") + + if not ddof: + # Weighted or unweighted variance with ddof=0 + f = 1 / V1 + elif not d["weighted"]: + # Unweighted variance with any non-zero value of ddof + f = 1 / (V1 - ddof) + elif ddof == 1: + # Weighted variance with ddof=1 + f = V1 / (V1 * V1 - d["V2"]) + else: + raise ValueError( + "Can only calculate a weighted variance with ddof=0 or ddof=1. " + f"Got: {ddof!r}" + ) + + # Now get the required global variance with the requested ddof + var = f * var + + var = mask_small_sample_size(var, d["N"], axis, mtol, original_shape) + return var diff --git a/cf/data/collapse_functions.py b/cf/data/collapse_functions.py deleted file mode 100644 index 79017e4ace..0000000000 --- a/cf/data/collapse_functions.py +++ /dev/null @@ -1,1379 +0,0 @@ -from functools import partial as functools_partial - -from numpy import abs as numpy_abs -from numpy import amax as numpy_amax -from numpy import amin as numpy_amin -from numpy import array as numpy_array -from numpy import asanyarray as numpy_asanyarray -from numpy import average as numpy_average -from numpy import bool_ as numpy_bool_ -from numpy import empty as numpy_empty -from numpy import expand_dims as numpy_expand_dims -from numpy import integer as numpy_integer -from numpy import maximum as numpy_maximum -from numpy import minimum as numpy_minimum -from numpy import ndim as numpy_ndim -from numpy import sum as numpy_sum -from numpy.ma import array as numpy_ma_array -from numpy.ma import average as numpy_ma_average -from numpy.ma import expand_dims as numpy_ma_expand_dims -from numpy.ma import isMA as numpy_ma_isMA -from numpy.ma import nomask as numpy_ma_nomask -from numpy.ma import where as numpy_ma_where - -from ..functions import broadcast_array - - -def asanyarray(*args): - """Return every input array as an numpy ndarray, or a subclass of. - - :Parameters: - - args: sequence of array-like input objects - - :Returns: - - `tuple` - The input objects left as, else converted to, `numpy.ndarray` - - """ - out = [] - for x in args: - if x is not None and not numpy_ndim(x): - # Make sure that we have a numpy array (as opposed to, e.g. a - # numpy.float64) - out.append(numpy_asanyarray(x)) - else: - out.append(x) - # --- End: for - - return out - - -def psum(x, y): - """Add two arrays element-wise. - - If either or both of the arrays are masked then the output array is - masked only where both input arrays are masked. - - :Parameters: - - x: numpy array-like - *Might be updated in place*. - - y: numpy array-like - Will not be updated in place. - - :Returns: - - `numpy.ndarray` - - **Examples:** - - >>> c = psum(a, b) - - """ - if numpy_ma_isMA(x): - if numpy_ma_isMA(y): - # x and y are both masked - x_mask = x.mask - x = x.filled(0) - x += y.filled(0) - x = numpy_ma_array(x, mask=x_mask & y.mask, copy=False) - else: - # Only x is masked - x = x.filled(0) - x += y - elif numpy_ma_isMA(y): - # Only y is masked - x += y.filled(0) - else: - # x and y are both unmasked - x += y - - return x - - -def pmax(x, y): - """The element-wise maximum of two arrays. - - :Parameters: - - x: array-like - May be updated in place and should not be used again. - - y: array-like - Will not be updated in place. - - :Returns: - - `numpy.ndarray` - - """ - if numpy_ma_isMA(x): - if numpy_ma_isMA(y): - # x and y are both masked - z = numpy_maximum(x, y) - z = numpy_ma_where(x.mask & ~y.mask, y, z) - x = numpy_ma_where(y.mask & ~x.mask, x, z) - if x.mask is numpy_ma_nomask: # not numpy_any(x.mask): - x = numpy_array(x) - else: - # Only x is masked - z = numpy_maximum(x, y) - x = numpy_ma_where(x.mask, y, z) - if x.mask is numpy_ma_nomask: # not numpy_any(x.mask): - x = numpy_array(x) - elif numpy_ma_isMA(y): - # Only y is masked - z = numpy_maximum(x, y) - x = numpy_ma_where(y.mask, x, z) - if x.mask is numpy_ma_nomask: # not numpy_any(x.mask): - x = numpy_array(x) - else: - # x and y are both unmasked - if not numpy_ndim(x): - # Make sure that we have a numpy array (as opposed to, - # e.g. a numpy.float64) - x = numpy_asanyarray(x) - - numpy_maximum(x, y, out=x) - - return x - - -def pmin(x, y): - """The element-wise minimum of two arrays. - - :Parameters: - - x: `numpy.ndarray` - May be updated in place and should not be used again. - - y: `numpy.ndarray` - Will not be updated in place. - - :Returns: - - `numpy.ndarray` - - """ - if numpy_ma_isMA(x): - if numpy_ma_isMA(y): - # x and y are both masked - z = numpy_minimum(x, y) - z = numpy_ma_where(x.mask & ~y.mask, y, z) - x = numpy_ma_where(y.mask & ~x.mask, x, z) - if x.mask is numpy_ma_nomask: - x = numpy_array(x) - else: - # Only x is masked - z = numpy_minimum(x, y) - x = numpy_ma_where(x.mask, y, z) - if x.mask is numpy_ma_nomask: - x = numpy_array(x) - elif numpy_ma_isMA(y): - # Only y is masked - z = numpy_minimum(x, y) - x = numpy_ma_where(y.mask, x, z) - if x.mask is numpy_ma_nomask: - x = numpy_array(x) - else: - # x and y are both unmasked - if not numpy_ndim(x): - # Make sure that we have a numpy array (as opposed to, - # e.g. a numpy.float64) - x = numpy_asanyarray(x) - - numpy_minimum(x, y, out=x) - - return x - - -def mask_where_too_few_values(Nmin, N, x): - """Mask elements of N and x where N is strictly less than Nmin. - - :Parameters: - - Nmin: `int` - - N: `numpy.ndarray` - - x: `numpy.ndarray` - - :Returns: - - (`numpy.ndarray`, `numpy.ndarray`) - A tuple containing *N* and *x*, both masked where *N* is - strictly less than *Nmin*. - - """ - if N.min() < Nmin: - mask = N < Nmin - N = numpy_ma_array(N, mask=mask, copy=False, shrink=False) - x = numpy_ma_array(x, mask=mask, copy=False, shrink=True) - - return asanyarray(N, x) - - -def double_precision(a): - """Convert the input array to double precision. - - :Parameters: - - a: `numpy.ndarray` - - :Returns: - - `numpy.ndarray` - - """ - char = a.dtype.char - if char == "f": - newtype = float - elif char == "i": - newtype = int - else: - return a - - if numpy_ma_isMA(a): - return a.astype(newtype) - else: - return a.astype(newtype, copy=False) - - -# -------------------------------------------------------------------- -# Maximum -# -------------------------------------------------------------------- -def max_f(a, axis=None, masked=False): - """Return the maximum of an array or maximum along a specified axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - masked: `bool` - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the maximum. - - """ - (N,) = sample_size_f(a, axis=axis, masked=masked) - amax = numpy_amax(a, axis=axis) - - if not numpy_ndim(amax): - # Make sure that we have a numpy array (as opposed to, e.g. a - # numpy.float64) - amax = numpy_asanyarray(amax) - - return asanyarray(N, amax) - - -def max_fpartial(out, out1=None, group=False): - """Return the partial maximum of an array. - - :Parameters: - - out: 2-`tuple` of `numpy.ndarray` - - out1: 2-`tuple` of `numpy.ndarray`, optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the maximum. - - """ - N, amax = out - - if out1 is not None: - N1, amax1 = out1 - N = psum(N, N1) - amax = pmax(amax, amax1) - - return asanyarray(N, amax) - - -def max_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the maximum of an - array. - - Here mask out any values derived from a too-small sample size. - - :Parameters: - - sub_samples: *optional* - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the maximum. - - """ - return mask_where_too_few_values(1, *out) - - -# -------------------------------------------------------------------- -# Minimum -# -------------------------------------------------------------------- -def min_f(a, axis=None, masked=False): - """Return the minimum of an array or minimum along a specified axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - masked: `bool`, optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the minimum. - - """ - (N,) = sample_size_f(a, axis=axis, masked=masked) - amin = numpy_amin(a, axis=axis) - - return asanyarray(N, amin) - - -def min_fpartial(out, out1=None, group=False): - """Return the partial minimum of an array. - - :Parameters: - - out: 2-`tuple` of `numpy.ndarray` - - out1: 2-`tuple` of `numpy.ndarray`, optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the minimum. - - """ - N, amin = out - - if out1 is not None: - N1, amin1 = out1 - N = psum(N, N1) - amin = pmin(amin, amin1) - - return asanyarray(N, amin) - - -def min_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the minimum of an - array. - - Here mask out any values derived from a too-small sample size. - - :Parameters: - - sub_samples: optional - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the minimum. - - """ - return mask_where_too_few_values(1, *out) - - -# -------------------------------------------------------------------- -# maximum_absolute_value -# -------------------------------------------------------------------- -def max_abs_f(a, axis=None, masked=False): - """Return the maximum of the absolute array, or the maximum of the - absolute array along an axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - masked: bool - - :Returns: - - 2-tuple of numpy arrays - The sample sizes and the maxima of the absolute values. - - """ - return max_f(numpy_abs(a), axis=axis, masked=masked) - - -max_abs_fpartial = max_fpartial -max_abs_ffinalise = max_ffinalise - - -# -------------------------------------------------------------------- -# minimum_absolute_value -# -------------------------------------------------------------------- -def min_abs_f(a, axis=None, masked=False): - """Return the minimum of the absolute array, or the minimum of the - absolute array along an axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input is - used. - - masked: `bool` - - :Returns: - - 2-tuple of `numpy.ndarray` - The sample sizes and the minima of the absolute values. - - """ - return min_f(numpy_abs(a), axis=axis, masked=masked) - - -min_abs_fpartial = min_fpartial -min_abs_ffinalise = min_ffinalise - - -# -------------------------------------------------------------------- -# mean -# -------------------------------------------------------------------- -def mean_f(a, axis=None, weights=None, masked=False): - """The weighted average along the specified axes. - - :Parameters: - - a: array-like - Input array. Not all missing data - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - masked: `bool`, optional - - :Returns: - - 3-`tuple` of `numpy.ndarray` - The sample size, average and sum of weights inside a 3-tuple. - - """ - a = double_precision(a) - - if masked: - average = numpy_ma_average - else: - average = numpy_average - - avg, sw = average(a, axis=axis, weights=weights, returned=True) - - if not numpy_ndim(avg): - avg = numpy_asanyarray(avg) - sw = numpy_asanyarray(sw) - - if weights is None: - N = sw.copy() - else: - (N,) = sample_size_f(a, axis=axis, masked=masked) - - return asanyarray(N, avg, sw) - - -def mean_fpartial(out, out1=None, group=False): - """Return the partial sample size, the partial sum and partial sum - of the weights. - - :Parameters: - - out: 3-`tuple` of `numpy.ndarray` - Either an output from a previous call to `mean_fpartial`; - or, if *out1* is `None`, an output from `mean_f`. - - out1: 3-`tuple` of `numpy.ndarray`, optional - An output from `mean_f`. - - :Returns: - - 3-`tuple` of `numpy.ndarray` - The sample size, average and sum of weights inside a 3-tuple. - - """ - N, avg, sw = out - - if out1 is None and not group: - # This is the first partition to be processed - - # Convert the partition average to a partition sum - avg *= sw - else: - # Combine this partition with existing parital combination - N1, avg1, sw1 = out1 - - # Convert the partition average to a partition sum - if not group: - avg1 *= sw1 - - N = psum(N, N1) - avg = psum(avg, avg1) # Now a partial sum - sw = psum(sw, sw1) - - return asanyarray(N, avg, sw) - - -def mean_ffinalise(out, sub_samples=None): - """Divide the weighted sum by the sum of weights. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: 3-`tuple` of `numpy.ndarray` - An output from `mean_fpartial`. - - sub_samples: optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the mean. - - """ - N, avg, sw = out - - if sub_samples: - avg /= sw - - return mask_where_too_few_values(1, N, avg) - - -# -------------------------------------------------------------------- -# mean_absolute_value -# -------------------------------------------------------------------- -def mean_abs_f(a, axis=None, weights=None, masked=False): - """Return the mean of the absolute array, or the means of the - absolute array along an axis. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input is - used. - - masked: `bool` - - :Returns: - - 2-tuple of `numpy.ndarray` - The sample sizes and the means of the absolute values. - - """ - return mean_f(numpy_abs(a), axis=axis, weights=weights, masked=masked) - - -mean_abs_fpartial = mean_fpartial -mean_abs_ffinalise = mean_ffinalise - - -# -------------------------------------------------------------------- -# root_mean_square -# -------------------------------------------------------------------- -def root_mean_square_f(a, axis=None, weights=None, masked=False): - """The RMS along the specified axes. - - :Parameters: - - a: array-like - Input array. Not all missing data - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - masked: `bool`, optional - - :Returns: - - `tuple` - 3-tuple. - - """ - a = double_precision(a) - - return mean_f(a ** 2, axis=axis, weights=weights, masked=masked) - - -root_mean_square_fpartial = mean_fpartial - - -def root_mean_square_ffinalise(out, sub_samples=None): - """Divide the weighted sum by the sum of weights and take the square - root. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: 3-`tuple` of `numpy.ndarray` - An output from `root_mean_square_fpartial`. - - sub_samples: optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the RMS. - - """ - N, avg = mean_ffinalise(out, sub_samples=sub_samples) - - avg **= 0.5 - - return asanyarray(N, avg) - - -# -------------------------------------------------------------------- -# Mid-range: Average of maximum and minimum -# -------------------------------------------------------------------- -def mid_range_f(a, axis=None, masked=False): - """Return the minimum and maximum of an array or the minimum and - maximum along an axis. - - ``mid_range_f(a, axis=axis)`` is equivalent to ``(numpy.amin(a, - axis=axis), numpy.amax(a, axis=axis))`` - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - kwargs: ignored - - :Returns: - - 3-`tuple` - The sample size, minimum and maximum inside a 3-tuple. - - """ - (N,) = sample_size_f(a, axis=axis, masked=masked) - amin = numpy_amin(a, axis=axis) - amax = numpy_amax(a, axis=axis) - - if not numpy_ndim(amin): - # Make sure that we have a numpy array (as opposed to, e.g. a - # numpy.float64) - amin = numpy_asanyarray(amin) - amax = numpy_asanyarray(amax) - - return asanyarray(N, amin, amax) - - -def mid_range_fpartial(out, out1=None, group=False): - """Return the partial minimum and partial maximum of an array. - - :Parameters: - - out: 3-`tuple` of `numpy.ndarray` - - out1: 3-`tuple` of `numpy.ndarray`, optional - - group: ignored - - :Returns: - - 3-`tuple` - The sample size, minimum and maximum inside a 3-tuple. - - """ - N, amin, amax = out - - if out1 is not None: - N1, amin1, amax1 = out1 - - N = psum(N, N1) - amin = pmin(amin, amin1) - amax = pmax(amax, amax1) - - return asanyarray(N, amin, amax) - - -def mid_range_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the array mid-range - value. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: ordered sequence - - amin: `numpy.ndarray` - - amax: `numpy.ndarray` - - sub_samples: optional - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the mid-range value. - - """ - N, amin, amax = out - - amax = double_precision(amax) - - # Cast bool, unsigned int, and int to float64 - if issubclass(amax.dtype.type, (numpy_integer, numpy_bool_)): - amax = amax.astype(float) - - # Calculate the mid-range value, storing it in the amax variable - amax += amin - amax *= 0.5 - - return mask_where_too_few_values(1, N, amax) - - -# --------------------------------------------------------------------- -# Range: Absolute difference between maximum and minimum -# --------------------------------------------------------------------- -range_f = mid_range_f -range_fpartial = mid_range_fpartial - - -def range_ffinalise(out, sub_samples=None): - """Absolute difference between maximum and minimum. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: ordered sequence - - sub_samples: optional - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the range. - - """ - N, amin, amax = out - - # Calculate the range value, storing it in the amax variable - amax = double_precision(amax) - amax -= amin - - return mask_where_too_few_values(1, N, amax) - - -# --------------------------------------------------------------------- -# Sample size -# --------------------------------------------------------------------- -def sample_size_f(a, axis=None, masked=False): - """Return the sample size. - - :Parameters: - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - :Returns: - - `numpy.ndarray` - - """ - if masked: - N = numpy_sum(~a.mask, axis=axis, dtype=float) - if not numpy_ndim(N): - N = numpy_asanyarray(N) - else: - if axis is None: - N = numpy_array(a.size, dtype=float) - else: - shape = a.shape - N = numpy_empty(shape[:axis] + shape[axis + 1 :], dtype=float) - N[...] = shape[axis] - # --- End: if - - return asanyarray(N) - - -def sample_size_fpartial(out, out1=None, group=False): - """Return the partial sample size. - - :Parameters: - - out: ordered sequence of one numpy array - - :Returns: - - `numpy.ndarray` - - """ - (N,) = out - if out1 is not None: - (N1,) = out1 - N = psum(N, N1) - - return asanyarray(N) - - -def sample_size_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to give the sample size. - - :Parameters: - - out: ordered sequence of one numpy array - - sub_samples: optional - Ignored. - - :Returns: - - `tuple` - A 2-tuple containing *N* twice. - - """ - (N,) = out - return asanyarray(N, N) - - -# --------------------------------------------------------------------- -# sum -# --------------------------------------------------------------------- -def sum_f(a, axis=None, weights=None, masked=False): - """Return the sum of an array or the sum along an axis. - - ``sum_f(a, axis=axis)`` is equivalent to ``(numpy.sum(a, - axis=axis),)`` - - :Parameters: - - a: numpy array-like - Input array - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the sum. - - """ - a = double_precision(a) - - (N,) = sample_size_f(a, axis=axis, masked=masked) - - if weights is not None: - # A weights array has been provided - weights = double_precision(weights) - - if weights.ndim < a.ndim: - weights = broadcast_array(weights, a.shape) - - a = a * weights - - asum = a.sum(axis=axis) - - if not numpy_ndim(asum): - asum = numpy_asanyarray(asum) - - return asanyarray(N, asum) - - -def sum_fpartial(out, out1=None, group=False): - """Return the partial sum of an array. - - :Parameters: - - out: 2-`tuple` of `numpy.ndarray` - - out1: 2-`tuple` of `numpy.ndarray`, optional - - group: *optional* - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the sum. - - """ - N, asum = out - - if out1 is not None: - N1, asum1 = out1 - N = psum(N, N1) - asum = psum(asum, asum1) - - return asanyarray(N, asum) - - -def sum_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the sum of an array. - - Here mask out any values derived from a too-small sample size. - - :Parameters: - - sub_samples: optional - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the sum. - - """ - return mask_where_too_few_values(1, *out) - - -# --------------------------------------------------------------------- -# sum_of_squares -# --------------------------------------------------------------------- -def sum_of_squares_f(a, axis=None, weights=None, masked=False): - """Return the sum of the square of an array or the sum of squares - along an axis. - - :Parameters: - - a: numpy array-like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - :Returns: - - `tuple` - 2-tuple - - """ - a = double_precision(a) - return sum_f(a ** 2, axis=axis, weights=weights, masked=masked) - - -sum_of_squares_fpartial = sum_fpartial -sum_of_squares_ffinalise = sum_ffinalise - - -# --------------------------------------------------------------------- -# Sum of weights -# --------------------------------------------------------------------- -def sw_f( - a, axis=None, masked=False, weights=None, N=None, sum_of_squares=False -): - """Return the sum of weights for an array. - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - masked: `bool`, optional - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - N: `numpy.ndarray` - Sample size - - sum_of_squares: delta degrees of freedom - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the sum of weights. - - """ - if N is None: - (N,) = sample_size_f(a, axis=axis, masked=masked) - - if weights is not None: - # A weights array has been provided - weights = double_precision(weights) - - if weights.ndim < a.ndim: - weights = broadcast_array(weights, a.shape) - - if masked: - weights = numpy_ma_array(weights, mask=a.mask, copy=False) - - if sum_of_squares: - weights = weights * weights - - sw = weights.sum(axis=axis) - - if not numpy_ndim(sw): - sw = numpy_asanyarray(sw) - else: - # The sum of weights is equal to the sample size (i.e. an - # unweighted sample) - sw = N.copy() - - return asanyarray(N, sw) - - -sw_fpartial = sum_fpartial -sw_ffinalise = sum_ffinalise - -# --------------------------------------------------------------------- -# Sum of squares of weights -# --------------------------------------------------------------------- -sw2_f = functools_partial(sw_f, sum_of_squares=True) -sw2_fpartial = sum_fpartial -sw2_ffinalise = sum_ffinalise - - -# --------------------------------------------------------------------- -# Variance -# --------------------------------------------------------------------- -def var_f(a, axis=None, weights=None, masked=False, ddof=0): - """Return a tuple containing metrics relating to the array variance. - - The tuple is a 7-tuple that contains, in the order given, the - following variables: - - ======== ============================================================ - Variable Description - ======== ============================================================ - N Sample size - - var Sample variance (ddof=0) - - avg Weighted mean - - V1 Sum of weights - - V2 Sum of squares of weights - - ddof Delta degrees of freedom - - weighted Whether or not the sample is weighted - ======== ============================================================ - - :Parameters: - - a: numpy array_like - Input array - - axis: `int`, optional - Axis along which to operate. By default, flattened input - is used. - - weights: numpy array-like, optional - Weights associated with values of the array. By default the - statistics are unweighted. - - masked: `bool`, optional - - ddof: delta degrees of freedom, optional - - :Returns: - - 7-`tuple` - Tuple containing the value of the statistical metrics described - in the above table, in the given order. - - """ - # Make sure that a is double precision - a = double_precision(a) - - weighted = weights is not None - - # ---------------------------------------------------------------- - # Methods: - # - # http://en.wikipedia.org/wiki/Standard_deviation#Population-based_statistics - # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance - # ---------------------------------------------------------------- - - # Calculate N = number of data points - # Calculate avg = mean of data - # Calculate V1 = sum of weights - N, avg, V1 = mean_f(a, weights=weights, axis=axis, masked=masked) - - # Calculate V2 = sum of squares of weights - if weighted and ddof == 1: - N, V2 = sw2_f(a, axis=axis, masked=masked, weights=weights, N=N) - else: - V2 = None - - if axis is not None and avg.size > 1: - # We collapsed over a single axis and the array has 2 or more - # axes, so add an extra size 1 axis to the mean so that - # broadcasting works when we calculate the variance. - reshape_avg = True - if masked: - expand_dims = numpy_ma_expand_dims - else: - expand_dims = numpy_expand_dims - - avg = expand_dims(avg, axis) - else: - reshape_avg = False - - var = a - avg - var *= var - - if masked: - average = numpy_ma_average - else: - average = numpy_average - - var = average(var, axis=axis, weights=weights) - - if reshape_avg: - shape = avg.shape - avg = avg.reshape(shape[:axis] + shape[axis + 1 :]) - - (N, var, avg, V1, V2) = asanyarray(N, var, avg, V1, V2) - - return (N, var, avg, V1, V2, ddof, weighted) - - -def var_fpartial(out, out1=None, group=False): - """Return a tuple of partial metrics relating to the array variance. - - The tuple is a 7-tuple that contains, in the order given, the - following variables: - - ======== ============================================================ - Variable Description - ======== ============================================================ - N Partial sample size - - var Partial sum of V1*(variance + mean^2) - - avg Unweighted partial sum - - V1 Partial sum of weights - - V2 Partial sum of squares of weights - - ddof Delta degrees of freedom - - weighted Whether or not the population is weighted - ======== ============================================================ - - For further information, see: - https://en.wikipedia.org/wiki/Pooled_variance#Population-based_statistics - - :Parameters: - - out: 7-`tuple` - - out1: 7-`tuple`, optional - - :Returns: - - 7-`tuple` - Tuple containing the value of the statistical metrics described - in the above table, in the given order. - - """ - (N, var, avg, V1, V2, ddof, weighted) = out - - if out1 is None and not group: - # ------------------------------------------------------------ - # var = V1(var+avg**2) - # avg = V1*avg = unweighted partial sum - # ------------------------------------------------------------ - var += avg * avg - var *= V1 - avg *= V1 - else: - # ------------------------------------------------------------ - # var = var + V1b(varb+avgb**2) - # avg = avg + V1b*avgb - # V1 = V1 + V1b - # V2 = V2 + V2b - # ------------------------------------------------------------ - (Nb, varb, avgb, V1b, V2b, ddof, weighted) = out1 - - N = psum(N, Nb) - - if not group: - varb += avgb * avgb - varb *= V1b - avgb *= V1b - - var = psum(var, varb) - avg = psum(avg, avgb) - V1 = psum(V1, V1b) - - if weighted and ddof == 1: - V2 = psum(V2, V2b) - # --- End: if - - (N, var, avg, V1, V2) = asanyarray(N, var, avg, V1, V2) - - return (N, var, avg, V1, V2, ddof, weighted) - - -def var_ffinalise(out, sub_samples=None): - """Calculate the variance of the array and return it with the sample - size. - - Also mask out any values derived from a too-small sample size. - - :Parameters: - - out: 7-`tuple` - - sub_samples: optional - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the variance. - - """ - (N, var, avg, V1, V2, ddof, weighted) = out - - N, var = mask_where_too_few_values(max(2, ddof + 1), N, var) - N, V1 = mask_where_too_few_values(max(2, ddof + 1), N, V1) - if V2 is not None: - N, V2 = mask_where_too_few_values(max(2, ddof + 1), N, V2) - - if sub_samples: - # ---------------------------------------------------------------- - # The global biased variance = {[SUM(pV1(pv+pm**2)]/V1} - m**2 - # - # where pV1 = partial sum of weights - # pv = partial biased variance - # pm = partial mean - # V1 = global sum of weights - # m = global mean - # - # Currently: var = SUM(pV1(pv+pm**2) - # avg = V1*m - # - # https://en.wikipedia.org/wiki/Pooled_variance#Population-based_statistics - # - # For the general case of M non-overlapping data sets, X_{1} - # through X_{M}, and the aggregate data set X=\bigcup_{i}X_{i} - # we have the unweighted mean and variance is: - # - # \mu_{X}={\frac{1}{\sum_{i}{N_{X_{i}}}}}\left(\sum_{i}{N_{X_{i}}\mu_{X_{i}}}\right) - # - # var_{X}={{\frac{1}{\sum_{i}{N_{X_{i}}-ddof}}}\left(\sum_{i}{\left[(N_{X_{i}}-1)\sigma_{X_{i}}^{2}+N_{X_{i}}\mu_{X_{i}}^{2}\right]}-\left[\sum_{i}{N_{X_{i}}}\right]\mu_{X}^{2}\right)} - # - # ---------------------------------------------------------------- - avg /= V1 - avg *= avg - var /= V1 - var -= avg - - # ---------------------------------------------------------------- - # var is now the biased global variance - # ---------------------------------------------------------------- - if not weighted: - if ddof: - # The unweighted variance with N-ddof degrees of freedom is - # [V1/(V1-ddof)]*var. In this case V1 equals the sample size, - # N. ddof=1 provides an unbiased estimator of the variance of - # a hypothetical infinite population. - V1 /= V1 - ddof - var *= V1 - elif ddof == 1: - # Calculate the weighted unbiased variance. The unbiased - # variance weighted with _reliability_ weights is - # [V1**2/(V1**2-V2)]*var. - # - # https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance - V1 **= 2 - var *= V1 - V1 -= V2 - var /= V1 - elif ddof: - raise ValueError( - "Can only calculate a weighted variance with a delta degrees " - "of freedom (ddof) of 0 or 1: Got {}".format(ddof) - ) - - return asanyarray(N, var) - - -# --------------------------------------------------------------------- -# standard_deviation -# --------------------------------------------------------------------- -sd_f = var_f -sd_fpartial = var_fpartial - - -def sd_ffinalise(out, sub_samples=None): - """Apply any logic to finalise the collapse to the standard - deviation. - - :Parameters: - - out: `tuple` - A 2-tuple - - sub_samples: *optional* - Ignored. - - :Returns: - - 2-`tuple` of `numpy.ndarray` - The sample size and the standard deviation. - - """ - N, sd = var_ffinalise(out, sub_samples) - - sd **= 0.5 - - return asanyarray(N, sd) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 6429fd2afe..73dc7aed8e 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -15,6 +15,8 @@ from ..functions import atol as cf_atol from ..functions import rtol as cf_rtol +# from dask.utils import deepmap # Apply function inside nested lists + def _da_ma_allclose(x, y, masked_equal=True, rtol=None, atol=None): """An effective dask.array.ma.allclose method. @@ -260,16 +262,22 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): original array *a*. mtol: number, optional - Set an upper limit of the amount input data values which - are allowed to be missing data when contributing to - individual output percentile values. It is defined as a - fraction (between 0 and 1 inclusive) of the contributing - input data values. The default is 1, meaning that a - missing datum in the output array only occurs when all of - its contributing input array elements are missing data. A - value of 0 means that a missing datum in the output array - occurs whenever any of its contributing input array - elements are missing data. + The sample size threshold below which collapsed values are + set to missing data. It is defined as a fraction (between + 0 and 1 inclusive) of the contributing input data values. + + The default of *mtol* is 1, meaning that a missing datum + in the output array occurs whenever all of its + contributing input array elements are missing data. + + For other values, a missing datum in the output array + occurs whenever more than ``100*mtol%`` of its + contributing input array elements are missing data. + + Note that for non-zero values of *mtol*, different + collapsed elements may have different sample sizes, + depending on the distribution of missing data in the input + data. :Returns: @@ -297,7 +305,7 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): a, axis=axis, keepdims=keepdims ) if n_missing.any(): - mask = np.where(n_missing >= mtol * full_size, True, False) + mask = np.where(n_missing > mtol * full_size, True, False) if q.ndim: mask = np.expand_dims(mask, 0) diff --git a/cf/data/data.py b/cf/data/data.py index 9c030b384c..c076cd114e 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -17,7 +17,6 @@ from dask.base import is_dask_collection, tokenize from dask.core import flatten from dask.highlevelgraph import HighLevelGraph -from numpy.testing import suppress_warnings as numpy_testing_suppress_warnings from ..cfdatetime import dt as cf_dt from ..constants import masked as cf_masked @@ -35,8 +34,6 @@ abspath, ) from ..functions import atol as cf_atol -from ..functions import broadcast_array -from ..functions import chunksize as cf_chunksize from ..functions import default_netCDF_fillvals from ..functions import fm_threshold as cf_fm_threshold from ..functions import free_memory @@ -45,54 +42,8 @@ from ..functions import rtol as cf_rtol from ..mixin_container import Container from ..units import Units -from . import NetCDFArray, UMArray -from .collapse_functions import ( # max_f,; max_ffinalise,; max_fpartial, - max_abs_f, - max_abs_ffinalise, - max_abs_fpartial, - mean_abs_f, - mean_abs_ffinalise, - mean_abs_fpartial, - mean_f, - mean_ffinalise, - mean_fpartial, - mid_range_f, - mid_range_ffinalise, - mid_range_fpartial, - min_abs_f, - min_abs_ffinalise, - min_abs_fpartial, - min_f, - min_ffinalise, - min_fpartial, - range_f, - range_ffinalise, - range_fpartial, - root_mean_square_f, - root_mean_square_ffinalise, - root_mean_square_fpartial, - sample_size_f, - sample_size_ffinalise, - sample_size_fpartial, - sd_f, - sd_ffinalise, - sd_fpartial, - sum_f, - sum_ffinalise, - sum_fpartial, - sum_of_squares_f, - sum_of_squares_ffinalise, - sum_of_squares_fpartial, - sw2_f, - sw2_ffinalise, - sw2_fpartial, - sw_f, - sw_ffinalise, - sw_fpartial, - var_f, - var_ffinalise, - var_fpartial, -) +from . import FileArray, NetCDFArray, UMArray +from .collapse import Collapse from .creation import ( compressed_to_dask, convert_to_builtin_type, @@ -850,19 +801,6 @@ def __float__(self): """ return float(self._get_dask()) - def __round__(self, *ndigits): - """Called to implement the built-in function `round` - - x.__round__(*ndigits) <==> round(x, *ndigits) - - """ - if self.size != 1: - raise TypeError( - "only length-1 arrays can be converted to Python scalars" - ) - - return round(self.datum(), *ndigits) - @daskified(_DASKIFIED_VERBOSE) def __int__(self): """Called to implement the built-in function `int` @@ -947,7 +885,7 @@ def __len__(self): """ dx = self._get_dask() if math.isnan(dx.size): - logger.warning("Computing data len: Performance may be degraded") + logger.debug("Computing data len: Performance may be degraded") dx.compute_chunk_sizes() return len(dx) @@ -1016,7 +954,7 @@ def __getitem__(self, indices): `Data` The subspace of the data. - **Examples:** + **Examples** >>> import numpy >>> d = Data(numpy.arange(100, 190).reshape(1, 10, 9)) @@ -1211,7 +1149,7 @@ def __setitem__(self, indices, value): .. seealso:: `__getitem__`, `__orthogonal_indexing__`, `cf.masked`, `hardmask`, `where` - **Examples:** + **Examples** """ indices, roll = parse_indices( @@ -1546,62 +1484,6 @@ def _del_dask(self, default=ValueError(), delete_source=True): return out - def _map_blocks( - self, func, delete_source=True, reset_mask_hardness=True, **kwargs - ): - """Apply a function to the data in-place. - - .. warning:: **This method **does not reset the mask - hardness**. It may be necessary for a call to - `_map_blocks` to be followed by a call to - `_reset_mask_hardness` (or equivalent). - - .. versionadded:: TODODASK - - :Parameters: - - func: - The function to be applied to the data, via - `dask.array.map_blocks`, to each chunk of the dask - array. - - delete_source: `bool`, optional - If False then do not delete a source array, if one - exists, after applying the function. By default a - source array is deleted. - - reset_mask_hardness: `bool`, optional - If False then do not reset the mask hardness after - applying the function. By default the mask hardness is - re-applied, even if the mask hardness has not changed. - - kwargs: optional - Keyword arguments passed to the - `dask.array.map_blocks` method. - - :Returns: - - `dask.array.Array` - The updated dask array. - - **Examples** - - >>> d = cf.Data([1, 2, 3]) - >>> dx = d._map_blocks(lambda x: x / 2) - >>> print(d.array) - [0.5 1. 1.5] - - """ - dx = self.get_dask(copy=False) - dx = dx.map_blocks(func, **kwargs) - self._set_dask( - dx, - delete_source=delete_source, - reset_mask_hardness=reset_mask_hardness, - ) - - return dx - def _reset_mask_hardness(self): """Re-apply the mask hardness to the dask array. @@ -2033,9 +1915,47 @@ def median( squeeze=False, mtol=1, inplace=False, - _preserve_partitions=False, ): - """Compute the median of the values.""" + """Calculate median values. + + Calculates the median value or the median values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `mean_of_upper_decile`, `percentile` + + :Parameters: + + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{inplace: `bool`, optional}} + + :Returns: + + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. + + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2]) + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.median() + + + """ return self.percentile( 50, axes=axes, @@ -2048,19 +1968,68 @@ def median( def mean_of_upper_decile( self, axes=None, - include_decile=True, - squeeze=False, weights=None, + squeeze=False, mtol=1, + include_decile=True, + split_every=None, inplace=False, - _preserve_partitions=False, ): - """Compute the mean the of upper decile. + """Calculate means of the upper deciles. + + Calculates the mean of the upper decile or the mean or the + mean of the upper decile values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `mean`, `median`, `percentile` + + :Parameters: + + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + TODODASK - note that weights only applies to the + calculation of the mean, not the upper + decile. + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + TODODASK - note that mtol only applies to the + calculation of the upper decile, not the + mean. - Specifically, calculate the mean of the upper group of data - values defined by the upper tenth of their distribution. + include_decile: `bool`, optional + TODODASK + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + + {{inplace: `bool`, optional}} + + :Returns: + + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. + + **Examples** + + TODODASK """ + + # TODODASK: Some updates off the back of daskifying collapse + # have been done, but still needs looking at. A unit + # test has also been written, but not run. Needs + # __lt__ and __le__. + d = _inplace_enabled_define_and_cleanup(self) p90 = d.percentile( @@ -2069,10 +2038,9 @@ def mean_of_upper_decile( squeeze=False, mtol=mtol, inplace=False, - _preserve_partitions=_preserve_partitions, ) - with numpy_testing_suppress_warnings() as sup: + with np.testing.suppress_warnings() as sup: sup.filter( RuntimeWarning, message=".*invalid value encountered in less.*" ) @@ -2080,7 +2048,6 @@ def mean_of_upper_decile( mask = d < p90 else: mask = d <= p90 - # --- End: with if mtol < 1: mask.filled(False, inplace=True) @@ -2089,10 +2056,11 @@ def mean_of_upper_decile( d.mean( axes=axes, - squeeze=squeeze, weights=weights, + squeeze=squeeze, + mtol=1, + split_every=split_every, inplace=True, - _preserve_partitions=_preserve_partitions, ) return d @@ -2204,23 +2172,11 @@ def percentile( is guaranteed to broadcast correctly against the original data. - mtol: number, optional - Set an upper limit of the amount input data values - which are allowed to be missing data when contributing - to individual output percentile values. It is defined - as a fraction (between 0 and 1 inclusive) of the - contributing input data values. The default is 1, - meaning that a missing datum in the output array only - occurs when all of its contributing input array - elements are missing data. A value of 0 means that a - missing datum in the output array occurs whenever any - of its contributing input array elements are missing - data. + {{mtol: number, optional}} - *Parameter example:* - To ensure that an output array element is a missing - value if more than 25% of its input array elements - are missing data: ``mtol=0.25``. + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -2259,7 +2215,7 @@ def percentile( [[-- -- -- --] [-- -- -- --] [-- 9 10 11]] - >>> e.sd() + >>> e.std() Find the mean of the values above the 45th percentile along the @@ -2430,6 +2386,8 @@ def persist(self, inplace=False): **Examples** + TODODASK + """ d = _inplace_enabled_define_and_cleanup(self) @@ -2496,7 +2454,7 @@ def dumpd(self): `dict` The serialization. - **Examples:** + **Examples** >>> d = cf.Data([[1, 2, 3]], 'm') >>> d.dumpd() @@ -2684,7 +2642,7 @@ def loadd(self, d, chunk=True): `None` - **Examples:** + **Examples** >>> d = Data([[1, 2, 3]], 'm') >>> e = Data([6, 7, 8, 9], 's') @@ -2976,7 +2934,7 @@ def ceil(self, inplace=False, i=False): The ceiling of the data. If the operation was in-place then `None` is returned. - **Examples:** + **Examples** >>> d = cf.Data([-1.9, -1.5, -1.1, -1, 0, 1, 1.1, 1.5 , 1.9]) >>> print(d.array) @@ -3466,7 +3424,9 @@ def _asdatetime(self, inplace=False): ) if not d._isdatetime(): - d._map_blocks(cf_rt2dt, units=units, dtype=object) + dx = d.to_dask_array() + dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -3521,7 +3481,9 @@ def _asreftime(self, inplace=False): ) if d._isdatetime(): - d._map_blocks(cf_dt2rt, units=units, dtype=float) + dx = d.to_dask_array() + dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -3540,7 +3502,7 @@ def _combined_units(self, data1, method, inplace): `Data` or `None`, `Data` or `None`, `Units` - **Examples:** + **Examples** >>> d._combined_units(e, '__sub__') >>> d._combined_units(e, '__imul__') @@ -3936,7 +3898,7 @@ def _binary_operation(self, other, method): A new data object, or if the operation was in place, the same data object. - **Examples:** + **Examples** >>> d = cf.Data([0, 1, 2, 3]) >>> e = cf.Data([1, 1, 3, 4]) @@ -4328,7 +4290,7 @@ def concatenate(cls, data, axis=0, _preserve=True): `Data` The concatenated data. - **Examples:** + **Examples** >>> d = cf.Data([[1, 2], [3, 4]], 'km') >>> e = cf.Data([[5.0, 6.0]], 'metre') @@ -4638,7 +4600,7 @@ def _unary_operation(self, operation): `Data` A new Data array. - **Examples:** + **Examples** >>> d = cf.Data([[1, 2, -3, -4, -5]]) @@ -5085,898 +5047,131 @@ def __pos__(self): """ return self._unary_operation("__pos__") - @_deprecated_kwarg_check("i") - @_inplace_enabled(default=False) - def _collapse( - self, - func, - fpartial, - ffinalise, - axes=None, - squeeze=False, - weights=None, - mtol=1, - units=None, - inplace=False, - i=False, - _preserve_partitions=False, - **kwargs, - ): - """Collapse the data. - - :Parameters: - - func: function - - fpartial: function - - ffinalise: function + # ---------------------------------------------------------------- + # Private attributes + # ---------------------------------------------------------------- + @property + def _Units(self): + """Storage for the units. - axes: (sequence of) `int`, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. + The units are stored in a `Units` object, and reflect the + units of the (yet to be computed) elements of the underlying + data. - squeeze: `bool`, optional - If False then the axes which are collapsed are left in the - result as axes with size 1. In this case the result will - broadcast correctly against the original array. By default - collapsed axes are removed. + .. warning:: Assigning to `_Units` does *not* trigger a units + conversion of the underlying data + values. Therefore assigning to `_Units` should + only be done in cases when it is known that the + intrinsic units represented by the data values + are inconsistent with the existing value of + `_Units`. Before assigning to `_Units`, first + consider if assigning to `Units`, or calling the + `override_units` or `override_calendar` method is + a more appropriate course of action, and use one + of those if possible. - weights: *optional* + """ + return self._custom["_Units"] - {{inplace: `bool`, optional}} + @_Units.setter + def _Units(self, value): + self._custom["_Units"] = value - {{i: deprecated at version 3.0.0}} + @_Units.deleter + def _Units(self): + self._custom["_Units"] = _units_None - _preserve_partitions: `bool`, optional - If True then preserve the shape of the partition matrix of - the input data, at the expense of a much slower - execution. By default, the partition matrix may be reduced - (using `varray`) to considerably speed things up. + @property + def _cyclic(self): + """Storage for axis cyclicity. - kwargs: *optional* + Contains a `set` that identifies which axes are cyclic (and + therefore allow cyclic slicing). The set contains a subset of + the axis identifiers defined by the `_axes` attribute. - :Returns: + .. warning:: Never change the value of the `_cyclic` attribute + in-place. - `Data` or `None` + .. note:: When an axis identifier is removed from the `_axes` + attribute then it is automatically also removed from + the `_cyclic` attribute. """ - d = _inplace_enabled_define_and_cleanup(self) - ndim = d._ndim - self_axes = d._axes - self_shape = d._shape - - original_self_axes = self_axes[:] - - if axes is None: - # Collapse all axes - axes = list(range(ndim)) - n_collapse_axes = ndim - n_non_collapse_axes = 0 - Nmax = d._size - elif not axes and axes != 0: - # Collapse no axes - if inplace: - d = None - return d - else: - # Collapse some (maybe all) axes - axes = sorted(d._parse_axes(axes)) # , '_collapse')) - n_collapse_axes = len(axes) - n_non_collapse_axes = ndim - n_collapse_axes - Nmax = 1 - for i in axes: - Nmax *= self_shape[i] - # --- End: if + return self._custom["_cyclic"] - # ------------------------------------------------------------- - # Parse the weights. - # - # * Change the keys from dimension names to the integer - # positions of the dimensions. - # - # * Make sure all non-null weights are Data objects. - # ------------------------------------------------------------ - if weights is not None: - if not isinstance(weights, dict): - # If the shape of the weights is not the same as the - # shape of the data array then the weights are assumed - # to span the collapse axes in the order in which they - # are given - if np.shape(weights) == self_shape: - weights = {tuple(self_axes): weights} - else: - weights = {tuple([self_axes[i] for i in axes]): weights} + @_cyclic.setter + def _cyclic(self, value): + self._custom["_cyclic"] = value - else: - weights = weights.copy() - weights_axes = set() - for key, value in tuple(weights.items()): - del weights[key] - key = d._parse_axes(key) - if weights_axes.intersection(key): - raise ValueError("Duplicate weights axis") - - weights_axes.update(key) - weights[tuple([self_axes[i] for i in key])] = value - # --- End: for + @_cyclic.deleter + def _cyclic(self): + self._custom["_cyclic"] = _empty_set - if not weights_axes.intersection(axes): - # Ignore all of the weights if none of them span - # any collapse axes - weights = {} - # --- End: if + @property + @daskified(_DASKIFIED_VERBOSE) + def _hardmask(self): + """Storage for the mask hardness. - for key, weight in tuple(weights.items()): - if weight is None or np.size(weight) == 1: - # Ignore undefined weights and size 1 weights - del weights[key] - continue + Contains a `bool`, where `True` denotes a hard mask and + `False` denotes a soft mask. - weight_ndim = np.ndim(weight) - if weight_ndim != len(key): - raise ValueError( - "Can't collapse: Incorrect number of weights " - "axes (%d != %d)" % (weight.ndim, len(key)) - ) + .. warning:: Assigning to `_hardmask` does *not* trigger a + hardening or softening of the mask of the + underlying data values. Therefore assigning to + `_hardmask` should only be done in cases when it + is known that the intrinsic mask hardness of the + data values is inconsistent with the + existing value of `_hardmask`. Before assigning + to `_hardmask`, first consider if assigning to + `hardmask`, or calling the `harden_mask` or + `soften_mask` method is a more appropriate course + of action, and use one of those if possible. - if weight_ndim > ndim: - raise ValueError( - "Can't collapse: Incorrect number of weights " - "axes (%d > %d)" % (weight.ndim, ndim) - ) + See `hardmask` for details. - for n, axis in zip(np.shape(weight), key): - if n != self_shape[self_axes.index(axis)]: - raise ValueError( - "Can't collapse: Incorrect weights " - "shape {!r}".format(np.shape(weight)) - ) - # --- End: for + """ + return self._custom["_hardmask"] - # Convert weight to a data object, if necessary. - weight = type(self).asdata(weight) + @_hardmask.setter + def _hardmask(self, value): + self._custom["_hardmask"] = value - if weight.dtype.char in ("S", "U"): - # Ignore string-valued weights - del weights[key] - continue + @property + @daskified(_DASKIFIED_VERBOSE) + def _axes(self): + """Storage for the axis identifiers. - weights[key] = weight - # --- End: for - # --- End: if + Contains a `tuple` of identifiers, one for each array axis. - if axes != list(range(n_non_collapse_axes, ndim)): - transpose_iaxes = [i for i in range(ndim) if i not in axes] + axes - d.transpose(transpose_iaxes, inplace=True) + .. note:: When the axis identifiers are reset, then any axis + identifier named by the `_cyclic` attribute which is + not in the new `_axes` set is automatically removed + from the `_cyclic` attribute. - if weights: - # Optimize when weights span only non-partitioned axes - # (do this before permuting the order of the weight - # axes to be consistent with the order of the data - # axes) - weights = d._collapse_optimize_weights(weights) + """ + return self._custom["_axes"] - # Permute the order of the weight axes to be - # consistent with the order of the data axes - self_axes = d._axes - for key, w in tuple(weights.items()): - key1 = tuple([axis for axis in self_axes if axis in key]) + @_axes.setter + def _axes(self, value): + self._custom["_axes"] = tuple(value) - if key1 != key: - w = w.transpose([key.index(axis) for axis in key1]) - - del weights[key] - ikey = tuple([self_axes.index(axis) for axis in key1]) - - weights[ikey] = w - - # Add the weights to kwargs - kwargs["weights"] = weights - - # If the input data array 'fits' in one chunk of memory, then - # make sure that it has only one partition - if ( - not _preserve_partitions - and d._pmndim - and d.fits_in_one_chunk_in_memory(d.dtype.itemsize) - ): - d.varray - - # ------------------------------------------------------------- - # Initialise the output data array - # ------------------------------------------------------------- - new = d[(Ellipsis,) + (0,) * n_collapse_axes] - - # new._auxiliary_mask = None - for partition in new.partitions.matrix.flat: - # Do this so as not to upset the ref count on the - # parittion's of d - del partition.subarray - - # d.to_memory() - - # save = not new.fits_in_memory(new.dtype.itemsize) - keep_in_memory = new.fits_in_memory(new.dtype.itemsize) - - datatype = d.dtype - - if units is None: - new_units = new.Units - else: - new_units = units - - p_axes = new._axes[:n_non_collapse_axes] - p_units = new_units - - c_slice = (slice(None),) * n_collapse_axes - - config = new.partition_configuration( - readonly=False, auxiliary_mask=None, extra_memory=False # DCH ??x - ) - - processed_partitions = [] - for pmindex, partition in np.ndenumerate(new.partitions.matrix): - if partition._process_partition: - # Only process the partition if it is flagged - partition.open(config) - - # Save the position of the partition in the partition - # matrix - partition._pmindex = pmindex - - partition.axes = p_axes - partition.flip = [] - partition.part = [] - partition.Units = p_units - - if squeeze: - # Note: parentheses for line continuation (not a tuple): - partition.location = partition.location[ - :n_non_collapse_axes - ] - partition.shape = partition.shape[:n_non_collapse_axes] - - indices = partition.indices[:n_non_collapse_axes] + c_slice - - partition.subarray = d._collapse_subspace( - func, - fpartial, - ffinalise, - indices, - n_non_collapse_axes, - n_collapse_axes, - Nmax, - mtol, - _preserve_partitions=_preserve_partitions, - _parallelise_collapse_subspace=False, - **kwargs, - ) - - partition.close(keep_in_memory=keep_in_memory) - - # Add each partition to a list of processed partitions - processed_partitions.append(partition) - # --- End: if - # --- End: for - - # processed_partitions contains a list of all the partitions - # that have been processed on this rank. In the serial case - # this is all of them and this line of code has no - # effect. Otherwise the processed partitions from each rank - # are distributed to every rank and processed_partitions now - # contains all the processed partitions from every rank. - processed_partitions = self._share_partitions( - processed_partitions, False - ) - - # Put the processed partitions back in the partition matrix - # according to each partitions _pmindex attribute set above. - pm = new.partitions.matrix - for partition in processed_partitions: - pm[partition._pmindex] = partition - - p_datatype = partition.subarray.dtype - if datatype != p_datatype: - datatype = np.result_type(p_datatype, datatype) - # --- End: for - - new._Units = new_units - new.dtype = datatype - - if squeeze: - new._axes = p_axes - new._ndim = ndim - n_collapse_axes - new._shape = new._shape[: new._ndim] - else: - new_axes = new._axes - if new_axes != original_self_axes: - iaxes = [new_axes.index(axis) for axis in original_self_axes] - new.transpose(iaxes, inplace=True) - # --- End: if - - # ------------------------------------------------------------ - # Update d in place and return - # ------------------------------------------------------------ - d.__dict__ = new.__dict__ - - return d - - def _collapse_subspace( - self, - func, - fpartial, - ffinalise, - indices, - n_non_collapse_axes, - n_collapse_axes, - Nmax, - mtol, - weights=None, - _preserve_partitions=False, - _parallelise_collapse_subspace=True, - **kwargs, - ): - """Collapse a subspace of a data array. - - If set, *weights* and *kwargs* are passed to the function call. If - there is a *weights* keyword argument then this should either evaluate - to False or be a dictionary of weights for at least one of the data - dimensions. - - :Parameters: - - func : function - - fpartial : function - - ffinalise : function - - indices: tuple - The indices of the master array which would create the - subspace. - - n_non_collapse_axes : int - The number of data array axes which are not being - collapsed. It is assumed that they are in the slowest moving - positions. - - n_collapse_axes : int - The number of data array axes which are being collapsed. It is - assumed that they are in the fastest moving positions. - - weights : dict, optional - - kwargs : *optional* - - :Returns: - - `list` - - **Examples:** - - """ - - ndim = self._ndim - - master_shape = self.shape - - data = self[indices] - - # If the input data array 'fits' in one chunk of memory, then - # make sure that it has only one partition - if ( - not _preserve_partitions - and data._pmndim - and data.fits_in_memory(data.dtype.itemsize) - ): - data.varray - - # True iff at least two, but not all, axes are to be - # collapsed. - reshape = 1 < n_collapse_axes < ndim - - out = None - - if n_collapse_axes == ndim: - # All axes are to be collapsed - kwargs.pop("axis", None) - else: - # At least one axis, but not all axes, are to be - # collapsed. It is assumed that the collapse axes are in - # the last (fastest varying) positions (-1, -2, ...). We - # set kwargs['axis']=-1 (actually we use the +ve integer - # equivalent of -1) if there is more then one collapse - # axis because, in this case (i.e. reshape is True), we - # will reshape everything. - kwargs["axis"] = ndim - n_collapse_axes - - masked = False - - sub_samples = 0 - - # pda_args = data.pda_args(revert_to_file=True) #, readonly=True) - config = data.partition_configuration(readonly=True) - - # Flag which partitions will be processed on this rank. If - # _parallelise_collapse_subspace is False then all partitions - # will be flagged for processing. - data._flag_partitions_for_processing(_parallelise_collapse_subspace) - - for i, partition in enumerate(data.partitions.matrix.flat): - if partition._process_partition: - # Only process a partition if flagged - partition.open(config) - array = partition.array - - p_masked = partition.masked - - if p_masked: - masked = True - if array.mask.all(): - # The array is all missing data - partition.close() - continue - - # Still here? Then there are some non-missing sub-array - # elements. - if weights is not None: - w = self._collapse_create_weights( - array, - partition.indices, - indices, - master_shape, - weights, - n_non_collapse_axes, - n_collapse_axes, - ) - wmin = w.min() - if wmin < 0: - raise ValueError( - "Can't collapse with negative weights" - ) - - if wmin == 0: - # Mask the array where the weights are zero - array = np.ma.masked_where(w == 0, array, copy=True) - if array.mask.all(): - # The array is all missing data - partition.close() - continue - # --- End: if - - kwargs["weights"] = w - # --- End: if - - partition.close() - - if reshape: - # At least two, but not all, axes are to be collapsed - # => we need to reshape the array and the weights. - shape = array.shape - ndim = array.ndim - new_shape = shape[:n_non_collapse_axes] - new_shape += (reduce(mul, shape[n_non_collapse_axes:]),) - array = np.reshape(array.copy(), new_shape) - - if weights is not None: - w = kwargs["weights"] - if w.ndim < ndim: - # The weights span only collapse axes (as - # opposed to spanning all axes) - new_shape = (w.size,) - - kwargs["weights"] = np.reshape(w, new_shape) - # --- End: if - - p_out = func(array, masked=p_masked, **kwargs) - - if out is None: - if ( - not _parallelise_collapse_subspace - and data.partitions.size == i + 1 - ): - # There is exactly one partition so we are done - out = p_out - break - # --- End: if - out = fpartial(p_out) - else: - out = fpartial(out, p_out) - # --- End: if - - sub_samples += 1 - - # --- End: if - # --- End: for - - # In the case that the inner loop is not parallelised, - # just finalise. - out = self._collapse_finalise( - ffinalise, - out, - sub_samples, - masked, - Nmax, - mtol, - data, - n_non_collapse_axes, - ) - # # --- End: if - - return out - - @classmethod - def _collapse_finalise( - cls, - ffinalise, - out, - sub_samples, - masked, - Nmax, - mtol, - data, - n_non_collapse_axes, - ): - """Finalise a collapse over a data array.""" - if out is not None: - # Finalise - N, out = ffinalise(out, sub_samples) - out = cls._collapse_mask(out, masked, N, Nmax, mtol) - else: - # no data - return all masked - out = np.ma.masked_all( - data.shape[:n_non_collapse_axes], data.dtype - ) - - return out - - @staticmethod - def _collapse_mask(array, masked, N, Nmax, mtol): - """Re-masks a masked array to reflect a collapse. - - :Parameters: - - array: numpy array - - masked: bool - - N: numpy array-like - - Nmax: int - - mtol: numpy array-like - - :Returns: - - numpy array - - """ - if masked and mtol < 1: - x = N < (1 - mtol) * Nmax - if x.any(): - array = np.ma.masked_where(x, array, copy=False) - # --- End: if - - return array - - @staticmethod - def _collapse_create_weights( - array, - indices, - master_indices, - master_shape, - master_weights, - n_non_collapse_axes, - n_collapse_axes, - ): - """Collapse weights of an array. - - :Parameters: - - array : numpy array - - indices : tuple - - master_indices : tuple - - master_shape : tuple - - master_weights : dict - - n_non_collapse_axes : int - The number of array axes which are not being collapsed. It - is assumed that they are in the slowest moving positions. - - n_collapse_axes : int - The number of array axes which are being collapsed. It is - assumed that they are in the fastest moving positions. - - :Returns: - - `numpy array or `None` - - **Examples:** - - """ - array_shape = array.shape - array_ndim = array.ndim - - weights_indices = [] - for master_index, index, size in zip( - master_indices, indices, master_shape - ): - start, stop, step = master_index.indices(size) - - size1, mod = divmod(stop - start - 1, step) - - start1, stop1, step1 = index.indices(size1 + 1) - - size2, mod = divmod(stop1 - start1, step1) - - if mod != 0: - size2 += 1 - - start += start1 * step - step *= step1 - stop = start + (size2 - 1) * step + 1 - - weights_indices.append(slice(start, stop, step)) - - base_shape = (1,) * array_ndim - - masked = False - zero_weights = False - - weights = [] - for key, weight in master_weights.items(): - shape = list(base_shape) - index = [] - for i in key: - shape[i] = array_shape[i] - index.append(weights_indices[i]) - - weight = weight[tuple(index)].array - - zero_weights = zero_weights or (weight.min() <= 0) - - masked = masked or np.ma.isMA(weight) - - if weight.ndim != array_ndim: - # Make sure that the weight has the same number of - # dimensions as the array - weight = weight.reshape(shape) - - weights.append(weight) - - weights_out = weights[0] - - if len(weights) > 1: - # There are two or more weights, so create their product - # (can't do this in-place because of broadcasting woe) - for w in weights[1:]: - weights_out = weights_out * w - - weights_out_shape = weights_out.shape - - if ( - not masked - and weights_out_shape[:n_non_collapse_axes] - == base_shape[:n_non_collapse_axes] - ): - # The input weights are not masked and only span collapse axes - weights_out = weights_out.reshape( - weights_out_shape[n_non_collapse_axes:] - ) - - if ( - weights_out_shape[n_non_collapse_axes:] - != array_shape[n_non_collapse_axes:] - ): - # The input weights span some, but not all, of the - # collapse axes, so broadcast the weights over all - # collapse axes - weights_out = broadcast_array( - weights_out, array_shape[n_non_collapse_axes:] - ) - else: - if weights_out_shape != array_shape: - # Either a) The input weights span at least one - # non-collapse axis, so broadcast the weights over all - # axes or b) The weights contain masked values - weights_out = broadcast_array(weights_out, array_shape) - - if masked and np.ma.isMA(array): - if not (array.mask | weights_out.mask == array.mask).all(): - raise ValueError( - "The output weights mask {} is not compatible with " - "the array mask {}.".format( - weights_out.mask, array.mask - ) - ) - # --- End: if - - return weights_out - - def _collapse_optimize_weights(self, weights): - """Optimise when weights span only non-partitioned axes. - - weights: `dict` - - """ - non_partitioned_axes = set(self._axes).difference(self._pmaxes) - - x = [] - new_key = () - for key in weights: - if non_partitioned_axes.issuperset(key): - x.append(key) - new_key += key - # --- End: for - - if len(x) > 1: - reshaped_weights = [] - for key in x: - w = weights.pop(key) - w = w.array - shape = [ - (w.shape[key.index(axis)] if axis in key else 1) - for axis in new_key - ] - w = w.reshape(shape) - - reshaped_weights.append(w) - - # Create their product - new_weight = reshaped_weights[0] - for w in reshaped_weights[1:]: - new_weight = new_weight * w - - weights[new_key] = type(self)(new_weight) - - return weights - - # ---------------------------------------------------------------- - # Private attributes - # ---------------------------------------------------------------- - @property - def _Units(self): - """Storage for the units. - - The units are stored in a `Units` object, and reflect the - units of the (yet to be computed) elements of the underlying - data. - - .. warning:: Assigning to `_Units` does *not* trigger a units - conversion of the underlying data - values. Therefore assigning to `_Units` should - only be done in cases when it is known that the - intrinsic units represented by the data values - are inconsistent with the existing value of - `_Units`. Before assigning to `_Units`, first - consider if assigning to `Units`, or calling the - `override_units` or `override_calendar` method is - a more appropriate course of action, and use one - of those if possible. - - """ - return self._custom["_Units"] - - @_Units.setter - def _Units(self, value): - self._custom["_Units"] = value - - @_Units.deleter - def _Units(self): - self._custom["_Units"] = _units_None - - @property - def _cyclic(self): - """Storage for axis cyclicity. - - Contains a `set` that identifies which axes are cyclic (and - therefore allow cyclic slicing). The set contains a subset of - the axis identifiers defined by the `_axes` attribute. - - .. warning:: Never change the value of the `_cyclic` attribute - in-place. - - .. note:: When an axis identifier is removed from the `_axes` - attribute then it is automatically also removed from - the `_cyclic` attribute. - - """ - return self._custom["_cyclic"] - - @_cyclic.setter - def _cyclic(self, value): - self._custom["_cyclic"] = value - - @_cyclic.deleter - def _cyclic(self): - self._custom["_cyclic"] = _empty_set - - @property - @daskified(_DASKIFIED_VERBOSE) - def _hardmask(self): - """Storage for the mask hardness. - - Contains a `bool`, where `True` denotes a hard mask and - `False` denotes a soft mask. - - .. warning:: Assigning to `_hardmask` does *not* trigger a - hardening or softening of the mask of the - underlying data values. Therefore assigning to - `_hardmask` should only be done in cases when it - is known that the intrinsic mask hardness of the - data values is inconsistent with the - existing value of `_hardmask`. Before assigning - to `_hardmask`, first consider if assigning to - `hardmask`, or calling the `harden_mask` or - `soften_mask` method is a more appropriate course - of action, and use one of those if possible. - - See `hardmask` for details. - - """ - return self._custom["_hardmask"] - - @_hardmask.setter - def _hardmask(self, value): - self._custom["_hardmask"] = value - - @property - @daskified(_DASKIFIED_VERBOSE) - def _axes(self): - """Storage for the axis identifiers. - - Contains a `tuple` of identifiers, one for each array axis. - - .. note:: When the axis identifiers are reset, then any axis - identifier named by the `_cyclic` attribute which is - not in the new `_axes` set is automatically removed - from the `_cyclic` attribute. - - """ - return self._custom["_axes"] - - @_axes.setter - def _axes(self, value): - self._custom["_axes"] = tuple(value) - - # Remove cyclic axes that are not in the new axes - cyclic = self._cyclic - if cyclic: - # Never change the value of the _cyclic attribute in-place - self._cyclic = cyclic.intersection(value) + # Remove cyclic axes that are not in the new axes + cyclic = self._cyclic + if cyclic: + # Never change the value of the _cyclic attribute in-place + self._cyclic = cyclic.intersection(value) # ---------------------------------------------------------------- # Dask attributes # ---------------------------------------------------------------- @property - @daskified(_DASKIFIED_VERBOSE) def chunks(self): - """The chunk sizes for each dimension. - - **Examples** - - >>> d = cf.Data.ones((4, 5), chunks=(2, 4)) - >>> d.chunks - ((2, 2), (4, 1)) - - """ + """TODODASK.""" return self._get_dask().chunks - @chunks.setter - def chunks(self, chunks): - raise TypeError( - "Can't set chunks directly. Use the 'rechunk' method instead." - ) - @property def force_compute(self): - """TODODASK See also confg settings.""" + """TODODASK See also config settings.""" return self._custom.get("force_compute", False) @force_compute.setter @@ -5995,7 +5190,7 @@ def Units(self): .. seealso `override_units`, `override_calendar` - **Examples:** + **Examples** >>> d = cf.Data([1, 2, 3], units='m') >>> d.Units @@ -6043,12 +5238,9 @@ def cf_Units(x): x=x, from_units=old_units, to_units=value, inplace=False ) - self._map_blocks( - cf_Units, - delete_source=False, - reset_mask_hardness=False, - dtype=dtype, - ) + dx = self.to_dask_array() + dx = dx.map_blocks(cf_Units, dtype=dtype) + self._set_dask(dx, reset_mask_hardness=False) self._Units = value @@ -6078,7 +5270,7 @@ def data(self): def dtype(self): """The `numpy` data-type of the data. - **Examples:** + **Examples** TODODASK >>> d = cf.Data([0.5, 1.5, 2.5]) @@ -6121,6 +5313,7 @@ def dtype(self, value): self._set_dask(dx, reset_mask_hardness=False) @property + @daskified(_DASKIFIED_VERBOSE) def fill_value(self): """The data array missing data value. @@ -6130,7 +5323,7 @@ def fill_value(self): Deleting this attribute is equivalent to setting it to None, so this attribute is guaranteed to always exist. - **Examples:** + **Examples** >>> d.fill_value = 9999.0 >>> d.fill_value @@ -6173,7 +5366,7 @@ def hardmask(self): .. seealso:: `harden_mask`, `soften_mask`, `where`, `__setitem__` - **Examples:** + **Examples** >>> d = cf.Data([1, 2, 3]) >>> d.hardmask @@ -6245,7 +5438,7 @@ def is_masked(a): def isscalar(self): """True if the data array is a 0-d scalar array. - **Examples:** + **Examples** >>> d.ndim 0 @@ -6272,7 +5465,7 @@ def nbytes(self): If the number of bytes is unknown then it is calculated immediately by executing all delayed operations. - **Examples:** + **Examples** >>> d = cf.Data([[1, 1.5, 2]]) >>> d.dtype @@ -6290,7 +5483,7 @@ def nbytes(self): """ dx = self.get_dask(copy=False) if math.isnan(dx.size): - logger.warning("Computing nbytes: Performance may be degraded") + logger.debug("Computing data nbytes: Performance may be degraded") dx.compute_chunk_sizes() return dx.nbytes @@ -6300,7 +5493,7 @@ def nbytes(self): def ndim(self): """Number of dimensions in the data array. - **Examples:** + **Examples** >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) >>> d.ndim @@ -6336,7 +5529,7 @@ def shape(self): If the shape of the data is unknown then it is calculated immediately by executing all delayed operations. - **Examples:** + **Examples** >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) >>> d.shape @@ -6357,7 +5550,7 @@ def shape(self): """ dx = self.get_dask(copy=False) if math.isnan(dx.size): - logger.warning("Computing data shape: Performance may be degraded") + logger.debug("Computing data shape: Performance may be degraded") dx.compute_chunk_sizes() return dx.shape @@ -6372,7 +5565,7 @@ def size(self): If the size of the data is unknown then it is calculated immediately by executing all delayed operations. - **Examples:** + **Examples** >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) >>> d.size @@ -6398,7 +5591,7 @@ def size(self): dx = self.get_dask(copy=False) size = dx.size if math.isnan(size): - logger.warning("Computing data size: Performance may be degraded") + logger.debug("Computing data size: Performance may be degraded") dx.compute_chunk_sizes() size = dx.size @@ -6547,7 +5740,7 @@ def mask(self): `Data` - **Examples:** + **Examples** >>> d.shape (12, 73, 96) @@ -6604,7 +5797,7 @@ def mask_fpe(*arg): The behaviour prior to the change, or the current behaviour if no new value was specified. - **Examples:** + **Examples** >>> d = cf.Data([0., 1]) >>> e = cf.Data([1., 2]) @@ -6714,7 +5907,7 @@ def seterr(all=None, divide=None, over=None, under=None, invalid=None): The behaviour prior to the change, or the current behaviour if no new values are specified. - **Examples:** + **Examples** Set treatment for all types of floating-point errors to ``'raise'`` and then reset to the previous behaviours: @@ -6835,7 +6028,7 @@ def arctan(self, inplace=False): `Data` or `None` - **Examples:** + **Examples** >>> print(d.array) [[0.5 0.7] @@ -6900,7 +6093,7 @@ def arctan(self, inplace=False): # # `Data` # - # **Examples:** + # **Examples** # # ''' # return cls(numpy_arctan2(y, x), units=_units_radians) @@ -6924,7 +6117,7 @@ def arctanh(self, inplace=False): `Data` or `None` - **Examples:** + **Examples** >>> print(d.array) [[0.5 0.7] @@ -6979,7 +6172,7 @@ def arcsin(self, inplace=False): `Data` or `None` - **Examples:** + **Examples** >>> print(d.array) [[0.5 0.7] @@ -7034,7 +6227,7 @@ def arcsinh(self, inplace=False): `Data` or `None` - **Examples:** + **Examples** >>> print(d.array) [[0.5 0.7] @@ -7083,7 +6276,7 @@ def arccos(self, inplace=False): `Data` or `None` - **Examples:** + **Examples** >>> print(d.array) [[0.5 0.7] @@ -7138,7 +6331,7 @@ def arccosh(self, inplace=False): `Data` or `None` - **Examples:** + **Examples** >>> print(d.array) [[0.5 0.7] @@ -7187,7 +6380,7 @@ def all(self): `bool` Whether or not all data array elements evaluate to True. - **Examples:** + **Examples** >>> d = cf.Data([[1, 3, 2]]) >>> print(d.array) @@ -7253,7 +6446,7 @@ def allclose(self, y, rtol=None, atol=None): `bool` - **Examples:** + **Examples** >>> d = cf.Data([1000, 2500], 'metre') >>> e = cf.Data([1, 2.5], 'km') @@ -7287,7 +6480,7 @@ def any(self): .. seealso:: `all`, `allclose`, `isclose` - **Examples:** + **Examples** >>> d = cf.Data([[0, 0, 0]]) >>> d.any() @@ -7574,7 +6767,7 @@ def reconstruct_sectioned_data(cls, sections, cyclic=(), hardmask=None): `Data` The resulting reconstructed Data object. - **Examples:** + **Examples** >>> d = cf.Data(numpy.arange(120).reshape(2, 3, 4, 5)) >>> x = d.section([1, 3]) @@ -7645,7 +6838,6 @@ def argmax(self, axis=None, unravel=False): is located. unravel: `bool`, optional - If True then when locating the maximum over the whole data, return the location as an index for each axis as a `tuple`. By default an index to the flattened array @@ -7765,7 +6957,7 @@ def get_calendar(self, default=ValueError()): The calendar. - **Examples:** + **Examples** >>> d.set_calendar('julian') >>> d.get_calendar @@ -7840,122 +7032,178 @@ def set_units(self, value): """ self.Units = Units(value, self.get_calendar(default=None)) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") - def maximum( + def max( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapse axes with their maximum. + """Calculate maximum values. + + Calculates the maximum value or the maximum values along axes. - Missing data array elements are omitted from the calculation. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - .. seealso:: `minimum`, `mean`, `mid_range`, `sum`, `sd`, `var` + ..seealso:: `sample_size`, `maximum_absolute_value`, `min` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} - squeeze : bool, optional + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. - **Examples:** + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.max() + """ - # TODODASK: Placeholder for the real thing, that takes into - # account axes=axes, squeeze=squeeze, mtol=mtol, - # inplace=inplace. - # - # This is only here for now, in this form, to ensure that - # cf.read works - return self.get_dask(copy=False).max() + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.max, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, + mtol=mtol, + ) - # return self._collapse(max_f, max_fpartial, max_ffinalise, axes=axes, - # squeeze=squeeze, mtol=mtol, inplace=inplace, - # _preserve_partitions=_preserve_partitions) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def maximum_absolute_value( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, - _preserve_partitions=False, ): - """Collapse axes with their maximum absolute value. + """Calculate maximum absolute values. + + Calculates the maximum absolute value or the maximum absolute + values along axes. - Missing data elements are omitted from the calculation. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sum`, `sd`, - `var` + ..seealso:: `sample_size`, `max`, `minimum_absolute_value` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} - squeeze : bool, optional + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` The collapsed data, or `None` if the operation was in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[-99 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.maximum_absolute_value() - - >>> d.max() - - >>> d.maximum_absolute_value(axes=1) - - >>> d.max(axes=1) - + """ - return self._collapse( - max_abs_f, - max_abs_fpartial, - max_abs_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.max_abs, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") - def minimum( + def min( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, i=False, _preserve_partitions=False, ): - """Collapse axes with their minimum. + """Calculate minimum values. - Missing data array elements are omitted from the calculation. + Calculates the minimum value or the minimum values along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `max`, `minimum_absolute_value` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} - squeeze : bool, optional + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -7964,133 +7212,139 @@ def minimum( :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. - .. seealso:: `maximum`, `mean`, `mid_range`, `sum`, `sd`, `var` + **Examples** - **Examples:** + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.min() + """ - return self._collapse( - min_f, - min_fpartial, - min_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.min, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def minimum_absolute_value( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, - _preserve_partitions=False, ): - """Collapse axes with their minimum absolute value. + """Calculate minimum absolute values. - Missing data elements are omitted from the calculation. + Calculates the minimum absolute value or the minimum absolute + values along axes. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sum`, `sd`, - `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `maximum_absolute_value`, `min` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} - squeeze : bool, optional + .. versionadded:: TODODASK {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` The collapsed data, or `None` if the operation was in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[0, 0] = -99 + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[-99 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.minimum_absolute_value() - - >>> d.d.min() - - >>> d.minimum_absolute_value(axes=1) - - >>> d.min(axes=1) - - - """ - return self._collapse( - min_abs_f, - min_abs_fpartial, - min_abs_ffinalise, - axes=axes, - squeeze=squeeze, + + + """ + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.min_abs, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def mean( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapse axes with their mean. + """Calculate mean values. + + Calculates the mean value or the mean values along axes. - The mean is unweighted by default, but may be weighted (see the - *weights* parameter). + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - Missing data array elements and their corresponding weights - are omitted from the calculation. + ..seealso:: `sample_size`, `mean_abslute_value`, `sd`, `sum` :Parameters: - axes: (sequence of) int, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. + {{collapse axes: (sequence of) `int`, optional}} - squeeze: `bool`, optional - If True then collapsed axes are removed. By default the - axes which are collapsed are left in the result as axes - with size 1, meaning that the result is guaranteed to - broadcast correctly against the original array. - - weights: data-like or dict, optional - Weights associated with values of the array. By default - all non-missing elements of the array are assumed to have - a weight equal to one. If *weights* is a data-like object - then it must have either the same shape as the array or, - if that is not the case, the same shape as the axes being - collapsed. If *weights* is a dictionary then each key is - axes of the array (an int or tuple of ints) with a - corresponding data-like value of weights for those - axes. In this case, the implied weights array is the outer - product of the dictionary's values. + {{weights: data_like, `dict`, or `None`, optional}} - *Parameter example:* - If ``weights={1: w, (2, 0): x}`` then ``w`` must contain - 1-dimensional weights for axis 1 and ``x`` must contain - 2-dimensional weights for axes 2 and 0. This is - equivalent, for example, to ``weights={(1, 2, 0), y}``, - where ``y`` is the outer product of ``w`` and ``x``. If - ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is - equivalent to ``weights=y``. If ``axes=None`` and the - array is 3-dimensional then ``weights={(1, 2, 0), y}`` - is equivalent to ``weights=y.transpose([2, 0, 1])``. - - mtol: number, optional + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -8099,143 +7353,76 @@ def mean( :Returns: `Data` or `None` - The collapsed array. - - .. seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`, - `var` + The collapsed data, or `None` if the operation was + in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[1, 2, 4], [1, 4, 9]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked >>> print(d.array) - [[1 2 4] - [1 4 9]] - + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.mean() - - >>> d.mean(squeeze=True) - - >>> d.mean(axes=[0, 1]) - - >>> d.mean(axes=[1, 0]) - - >>> print(d.mean(axes=0).array) - [[1. 3. 6.5]] - >>> print(d.mean(axes=1).array) - [[2.33333333] - [4.66666667]] - >>> d.mean(axes=1, squeeze=True) - - - >>> y = cf.Data([1, 3]) - >>> x = cf.Data([1, 2, 1]) - >>> w = cf.Data.insert_dimension(y, 1) * x - >>> print(w.array) - [[1 2 1] - [3 6 3]] + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] >>> d.mean(weights=w) - - >>> d.mean(weights={(0, 1): w}) - - >>> d.mean(axes=[0, 1], weights={(0, 1): w}) - - >>> d.mean(axes=[1, 0], weights={(0, 1): w}) - - >>> d.mean(axes=(0, 1), weights={1: x, 0: y}) - - - >>> d.mean(axes=1, weights=w) - - >>> d.mean(axes=1, weights=x) - - >>> d.mean(axes=1, weights={1: x}) - - >>> d.mean(axes=1, weights={(0, 1): w}) - - >>> d.mean(axes=1, weights={0: y, (1,): x}) - - - >>> d.mean(axes=1) - - >>> d.mean(axes=1, weights={0: y}) - - - >>> e = cf.Data(numpy.arange(24).reshape(3, 2, 4)) - >>> print(e.array) - [[[ 0 1 2 3] - [ 4 5 6 7]] - [[ 8 9 10 11] - [12 13 14 15]] - [[16 17 18 19] - [20 21 22 23]]] - - >>> e.mean(axes=[0, 2]) - - >>> f = e.mean(axes=[0, 2], squeeze=True) - >>> f - - >>> f.shape - (2,) - >>> print(e.mean(axes=[0, 1]).array) - [[[10. 11. 12. 13.]]] - >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array) - [[[11. 12. 13. 14.]]] - - >>> e[0, 0] = cf.masked - >>> e[-1, -1] = cf.masked - >>> e[..., 2] = cf.masked - >>> print(e.array) - [[[-- -- -- --] - [4 5 -- 7]] - [[8 9 -- 11] - [12 13 -- 15]] - [[16 17 -- 19] - [-- -- -- --]]] - - >>> e.mean() - - >>> print(e.mean(axes=[0, 1]).array) - [[[10.0 11.0 -- 13.0]]] - >>> print(e.mean(axes=[0, 1], weights={(1, 0): w}).array) - [[[9.666666666666666 10.666666666666666 -- 12.666666666666666]]] - - """ - return self._collapse( - mean_f, - mean_fpartial, - mean_ffinalise, - axes=axes, - squeeze=squeeze, + + + """ + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.mean, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def mean_absolute_value( self, axes=None, squeeze=False, mtol=1, weights=None, + split_every=None, inplace=False, - _preserve_partitions=False, ): - """Collapse axes with their mean absolute value. + """Calculate mean absolute values. - Missing data elements are omitted from the calculation. + Calculates the mean absolute value or the mean absolute values + along axes. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sum`, `sd`, - `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `mean`, `sd`, `sum` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} - weights: + {{collapse squeeze: `bool`, optional}} - squeeze : bool, optional + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -8245,158 +7432,203 @@ def mean_absolute_value( The collapsed data, or `None` if the operation was in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[0, 0] = -99 + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[-99 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.mean_absolute_value() - - >>> d.mean_absolute_value(axes=1) - + + + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.mean_absolute_value(weights=w) + """ - return self._collapse( - mean_abs_f, - mean_abs_fpartial, - mean_abs_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.mean_abs, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def integral( self, axes=None, squeeze=False, mtol=1, weights=None, + split_every=None, inplace=False, _preserve_partitions=False, ): - """Collapse axes with their integral. + """Calculate summed values. + + Calculates the sum value or the sum values along axes. - If weights are not provided then all non-missing elements are - given weighting of one such that the collapse method becomes - a `sum`. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `mean`, `sd`, `sum` :Parameters: - axes: (sequence of) int, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. + {{collapse axes: (sequence of) `int`, optional}} - squeeze: `bool`, optional - If True then collapsed axes are removed. By default the - axes which are collapsed are left in the result as axes - with size 1, meaning that the result is guaranteed to - broadcast correctly against the original array. - - weights: data-like or dict, optional - Weights associated with values of the array. By default - all non-missing elements of the array are assumed to have - a weight equal to one. If *weights* is a data-like object - then it must have either the same shape as the array or, - if that is not the case, the same shape as the axes being - collapsed. If *weights* is a dictionary then each key is - axes of the array (an int or tuple of ints) with a - corresponding data-like value of weights for those - axes. In this case, the implied weights array is the outer - product of the dictionary's values. - - Note that the units of the weights matter for an integral - collapse, which differs from a weighted sum in that the units - of the weights are incorporated into the result. + {{weights: data_like, `dict`, or `None`, optional}} - *Parameter example:* - If ``weights={1: w, (2, 0): x}`` then ``w`` must contain - 1-dimensional weights for axis 1 and ``x`` must contain - 2-dimensional weights for axes 2 and 0. This is - equivalent, for example, to ``weights={(1, 2, 0), y}``, - where ``y`` is the outer product of ``w`` and ``x``. If - ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is - equivalent to ``weights=y``. If ``axes=None`` and the - array is 3-dimensional then ``weights={(1, 2, 0), y}`` - is equivalent to ``weights=y.transpose([2, 0, 1])``. - - mtol: number, optional + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` - The collapsed data, or `None` of the operation was + The collapsed data, or `None` if the operation was in-place. - .. seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`, - `var` + **Examples** - **Examples:** + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.integral() + - """ - if weights is None: - units = None - else: - units = self.Units - if not units: - units = Units("1") + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.integral(weights=w) + - weights_units = getattr(weights, "Units", None) - if weights_units is not None: - units = units * weights_units - else: - for w in weights.values(): - weights_units = getattr(w, "Units", None) - if weights_units is not None: - units = units * weights_units - # --- End: if + >>> d.integral(weights=cf.Data(w, 'm')) + - return self._collapse( - sum_f, - sum_fpartial, - sum_ffinalise, - axes=axes, - squeeze=squeeze, + """ + d = _inplace_enabled_define_and_cleanup(self) + d, weights = _collapse( + Collapse.sum, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - units=units, - _preserve_partitions=_preserve_partitions, ) + new_units = None + if weights is not None: + weights_units = getattr(weights, "Units", None) + if weights_units: + units = self.Units + if units: + new_units = units * weights_units + else: + new_units = weights_units + + if new_units is not None: + d.override_units(new_units, inplace=True) + + return d + + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") def sample_size( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapses axes with their sample size. + """Calculate sample size values. + + The sample size is the number of non-missing values. + + Calculates the sample size value or the sample size values + along axes. + + .. seealso:: `sum_of_weights` :Parameters: + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} + :Returns: + + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. + + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.sample_size() + + """ - return self._collapse( - sample_size_f, - sample_size_fpartial, - sample_size_ffinalise, - axes=axes, - squeeze=squeeze, - weights=None, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.sample_size, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - units=Units("1"), - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + d.override_units(_units_None, inplace=True) + + return d @property @daskified(_DASKIFIED_VERBOSE) @@ -8591,7 +7823,7 @@ def compressed(self, inplace=False): **Examples** - >>> d = cf.Data(numpy.arange(12).reshape(3, 4)) + >>> d = cf.Data(numpy.arange(12).reshape(3, 4), 'm') >>> print(d.array) [[ 0 1 2 3] [ 4 5 6 7] @@ -8608,58 +7840,24 @@ def compressed(self, inplace=False): [ 0 1 2 3 4 6 7 8 9 10] >>> d = cf.Data(9) - >>> print(d.array) - 9 >>> print(d.compressed().array) - 9 + [9] """ d = _inplace_enabled_define_and_cleanup(self) - ndim = d.ndim - - if ndim != 1: - d.flatten(inplace=True) - - n_non_missing = d.count() - if n_non_missing == d.size: - return d - - comp = self.empty( - shape=(n_non_missing,), dtype=self.dtype, units=self.Units + dx = d._get_dask() + dx = da.blockwise( + np.ma.compressed, + "i", + dx.ravel(), + "i", + adjust_chunks={"i": lambda n: np.nan}, + dtype=dx.dtype, + meta=np.array((), dtype=dx.dtype), ) - # Find the number of array elements that fit in one chunk - n = int(cf_chunksize() // (self.dtype.itemsize + 1.0)) - - # Loop around each chunk's worth of elements and assign the - # non-missing values to the compressed data - i = 0 - start = 0 - for _ in range(1 + d.size // n): - if i >= d.size: - break - - array = d[i : i + n].array - if np.ma.isMA(array): - array = array.compressed() - - size = array.size - if size >= 1: - end = start + size - comp[start:end] = array - start = end - - i += n - - if not d.ndim: - comp.squeeze(inplace=True) - - if inplace: - d.__dict__ = comp.__dict__ - else: - d = comp - + d._set_dask(dx, reset_mask_hardness=False) return d @daskified(_DASKIFIED_VERBOSE) @@ -8687,7 +7885,7 @@ def cos(self, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** >>> d.Units @@ -8731,7 +7929,7 @@ def count(self): ``int`` - **Examples:** + **Examples** >>> d = cf.Data(numpy.arange(24).reshape(3, 4)) >>> print(d.array) @@ -8756,7 +7954,8 @@ def count(self): 8 """ - # TODODASK - daskify, previously parallelise=mpi_on (not =False) + # TODODASK - simply use da.ma.count (dask>=2022.3.1) + config = self.partition_configuration(readonly=True) n = 0 @@ -9032,28 +8231,24 @@ def second(self): """ return YMDhms(self, "second") + @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) def uncompress(self, inplace=False): - """Uncompress the underlying data. - - Compression saves space by identifying and removing unwanted - missing data. Such compression techniques store the data more - efficiently and result in no precision loss. - - Whether or not the data is compressed does not alter its - functionality nor external appearance. + """Uncompress the data. - Data that is already uncompressed will be returned uncompressed. + Only affects data that is compressed by convention, i.e. - The following type of compression are available: + * Ragged arrays for discrete sampling geometries (DSG) and + simple geometry cell definitions. - * Ragged arrays for discrete sampling geometries (DSG). Three - different types of ragged array representation are - supported. + * Compression by gathering. - .. + * Compression by coordinate subsampling. - * Compression by gathering. + Data that is already uncompressed is returned + unchanged. Whether the data is compressed or not does not + alter its functionality nor external appearance, but may + affect how the data are written to a dataset on disk. .. versionadded:: 3.0.6 @@ -9066,10 +8261,10 @@ def uncompress(self, inplace=False): :Returns: `Data` or `None` - The uncompressed data, or `None` of the operation was + The uncompressed data, or `None` if the operation was in-place. - **Examples:** + **Examples** >>> d.get_compression_type() 'ragged contiguous' @@ -9079,20 +8274,8 @@ def uncompress(self, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - - if not d.get_compression_type(): - if inplace: - d = None - return d - - config = d.partition_configuration(readonly=False) - - for partition in d.partitions.matrix.flat: - partition.open(config) - _ = partition.array - partition.close() - - d._del_Array(None) + if d.get_compression_type(): + d._del_Array(None) return d @@ -9102,7 +8285,7 @@ def unique(self): Returns a new object with the sorted unique elements in a one dimensional array. - **Examples:** + **Examples** >>> d = cf.Data([[4, 2, 1], [1, 2, 3]], 'metre') >>> d.unique() @@ -9260,7 +8443,7 @@ def equals( `bool` Whether or not the two instances are equal. - **Examples:** + **Examples** >>> d.equals(d) True @@ -9361,7 +8544,7 @@ def exp(self, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** """ d = _inplace_enabled_define_and_cleanup(self) @@ -9403,7 +8586,7 @@ def insert_dimension(self, position=0, inplace=False): `Data` or `None` - **Examples:** + **Examples** """ d = _inplace_enabled_define_and_cleanup(self) @@ -9593,7 +8776,7 @@ def halo( The expanded data, or `None` if the operation was in-place. - **Examples:** + **Examples** >>> d = cf.Data(numpy.arange(12).reshape(3, 4), 'm') >>> d[-1, -1] = cf.masked @@ -9822,7 +9005,7 @@ def harden_mask(self): .. seealso:: `hardmask`, `soften_mask` - **Examples:** + **Examples** >>> d = cf.Data([1, 2, 3], hardmask=False) >>> d.hardmask @@ -9839,12 +9022,9 @@ def harden_mask(self): [1 -- 3] """ - self._map_blocks( - cf_harden_mask, - delete_source=False, - reset_mask_hardness=False, - dtype=self.dtype, - ) + dx = self.to_dask_array() + dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype) + self._set_dask(dx, delete_source=False, reset_mask_hardness=False) self._hardmask = True def has_calendar(self): @@ -9922,7 +9102,7 @@ def soften_mask(self): .. seealso:: `hardmask`, `harden_mask` - **Examples:** + **Examples** >>> d = cf.Data([1, 2, 3]) >>> d.hardmask @@ -9939,12 +9119,9 @@ def soften_mask(self): [ 1 999 3] """ - self._map_blocks( - cf_soften_mask, - delete_source=False, - reset_mask_hardness=False, - dtype=self.dtype, - ) + dx = self.to_dask_array() + dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype) + self._set_dask(dx, delete_source=False, reset_mask_hardness=False) self._hardmask = False @daskified(_DASKIFIED_VERBOSE) @@ -9969,7 +9146,7 @@ def filled(self, fill_value=None, inplace=False): `Data` or `None` The filled data, or `None` if the operation was in-place. - **Examples:** + **Examples** >>> d = {{package}}.Data([[1, 2, 3]]) >>> print(d.filled().array) @@ -9997,7 +9174,9 @@ def filled(self, fill_value=None, inplace=False): f"data type {d.dtype.str!r}" ) - d._map_blocks(np.ma.filled, fill_value=fill_value, dtype=d.dtype) + dx = d.to_dask_array() + dx = dx.map_blocks(np.ma.filled, fill_value=fill_value, dtype=d.dtype) + d._set_dask(dx, reset_mask_hardness=False) return d @@ -10023,7 +9202,7 @@ def first_element(self, verbose=None): The first element of the data - **Examples:** + **Examples** >>> d = cf.Data([[1, 2], [3, 4]]) >>> d.first_element() @@ -10064,7 +9243,7 @@ def second_element(self, verbose=None): The second element of the data - **Examples:** + **Examples** >>> d = cf.Data([[1, 2], [3, 4]]) >>> d.second_element() @@ -10105,7 +9284,7 @@ def last_element(self): The last element of the data - **Examples:** + **Examples** >>> d = cf.Data([[1, 2], [3, 4]]) >>> d.last_element() @@ -10326,7 +9505,7 @@ def floor(self, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** >>> d = cf.Data([-1.9, -1.5, -1.1, -1, 0, 1, 1.1, 1.5 , 1.9]) >>> print(d.array) @@ -10365,7 +9544,7 @@ def outerproduct(self, e, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** >>> d = cf.Data([1, 2, 3], 'metre') >>> o = d.outerproduct([4, 5, 6, 7]) @@ -10504,7 +9683,7 @@ def override_units(self, units, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** >>> d = cf.Data(1012.0, 'hPa') >>> d.override_units('km') @@ -10548,7 +9727,7 @@ def override_calendar(self, calendar, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** """ d = _inplace_enabled_define_and_cleanup(self) @@ -10556,6 +9735,23 @@ def override_calendar(self, calendar, inplace=False, i=False): return d + def to_dask_array(self): + """Store the data array on disk. + + There is no change to partitions whose sub-arrays are already + on disk. + + :Returns: + + `None` + + **Examples** + + >>> d.to_disk() + + """ + return self._get_dask() + @daskified(_DASKIFIED_VERBOSE) def datum(self, *index): """Return an element of the data array as a standard Python @@ -10614,7 +9810,7 @@ def datum(self, *index): A copy of the specified element of the array as a suitable Python scalar. - **Examples:** + **Examples** >>> d = cf.Data(2) >>> d.datum() @@ -10731,7 +9927,7 @@ def mask_invalid(self, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** >>> d = cf.Data([0., 1]) >>> e = cf.Data([1., 2]) @@ -10900,7 +10096,7 @@ def masked_all(cls, shape, dtype=None, units=None, chunk=True): `Data` The new data array having all elements masked. - **Examples:** + **Examples** >>> d = cf.Data.masked_all((96, 73)) @@ -10915,28 +10111,42 @@ def masked_all(cls, shape, dtype=None, units=None, chunk=True): return cls(array, units=units, chunk=chunk) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def mid_range( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, - _preserve_partitions=False, i=False, ): - """Collapse axes with the unweighted average of their maximum - and minimum values. + """Calculate mid-range values. + + The mid-range is half of the maximum plus the minimum. - Missing data array elements are omitted from the calculation. + Calculates the mid-range value or the mid-range values along + axes. - .. seealso:: `maximum`, `minimum`, `mean`, `range`, `sum`, `sd`, `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `max`, `min`, `range` :Parameters: - axes: (sequence of) `int`, optional + {{collapse axes: (sequence of) `int`, optional}} - squeeze: `bool`, optional + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -10947,19 +10157,30 @@ def mid_range( `Data` or `None` The collapsed array. - **Examples:** + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.mid_range() + """ - return self._collapse( - mid_range_f, - mid_range_fpartial, - mid_range_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.mid_range, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") @@ -10985,7 +10206,7 @@ def flip(self, axes=None, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** >>> d.flip() >>> d.flip(1) @@ -11059,7 +10280,7 @@ def isclose(self, y, rtol=None, atol=None): `bool` - **Examples:** + **Examples** >>> d = cf.Data([1000, 2500], 'metre') >>> e = cf.Data([1, 2.5], 'km') @@ -11106,6 +10327,71 @@ def isclose(self, y, rtol=None, atol=None): except (TypeError, NotImplementedError, IndexError): return self == y + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + def reshape(self, *shape, merge_chunks=True, limit=None, inplace=False): + """Change the shape of the data without changing its values. + + :Parameters: + + shape: `tuple` of `int`, or any number of `int` + The new shape for the data, which should be compatible + with the original shape. If an integer, then the + result will be a 1-d array of that length. One shape + dimension can be -1, in which case the value is + inferred from the length of the array and remaining + dimensions. + + merge_chunks: `bool` + When True (the default) merge chunks using the logic + in `dask.array.rechunk` when communication is + necessary given the input array chunking and the + output shape. When False, the input array will be + rechunked to a chunksize of 1, which can create very + many tasks. See `dask.array.reshape` for details. + + limit: int, optional + The maximum block size to target in bytes. If no limit + is provided, it defaults to the configuration value + ``dask.config.get('array.chunk-size')``. See + `dask.array.reshape` for details. + + :Returns: + + `Data` or `None` + The reshaped data, or `None` if the operation was + in-place. + + **Examples** + + >>> d = cf.Data(np.arange(12)) + >>> print(d.array) + [ 0 1 2 3 4 5 6 7 8 9 10 11] + >>> print(d.reshape(3, 4).array) + [[ 0 1 2 3] + [ 4 5 6 7] + [ 8 9 10 11]] + >>> print(d.reshape((4, 3)).array) + [[ 0 1 2] + [ 3 4 5] + [ 6 7 8] + [ 9 10 11]] + >>> print(d.reshape(-1, 6).array) + [[ 0 1 2 3 4 5] + [ 6 7 8 9 10 11]] + >>> print(d.reshape(1, 1, 2, 6).array) + [[[[ 0 1 2 3 4 5] + [ 6 7 8 9 10 11]]]] + >>> print(d.reshape(1, 1, -1).array) + [[[[ 0 1 2 3 4 5 6 7 8 9 10 11]]]] + + """ + d = _inplace_enabled_define_and_cleanup(self) + dx = d._get_dask() + dx = dx.reshape(*shape, merge_chunks=merge_chunks, limit=limit) + d._set_dask(dx, reset_mask_hardness=True) + return d + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) @@ -11128,7 +10414,7 @@ def rint(self, inplace=False, i=False): The rounded data. If the operation was in-place then `None` is returned. - **Examples:** + **Examples** >>> d = cf.Data([-1.9, -1.5, -1.1, -1, 0, 1, 1.1, 1.5 , 1.9]) >>> print(d.array) @@ -11142,84 +10428,79 @@ def rint(self, inplace=False, i=False): d._set_dask(da.rint(dx), reset_mask_hardness=False) return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def root_mean_square( self, axes=None, squeeze=False, mtol=1, weights=None, + split_every=None, inplace=False, - _preserve_partitions=False, ): - """Collapse axes with their root mean square. + """Calculate root mean square (RMS) values. + + Calculates the RMS value or the RMS values along axes. - Missing data array elements and their corresponding weights are - omitted from the calculation. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `mean`, `sum`, :Parameters: - axes: (sequence of) int, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. + {{collapse axes: (sequence of) `int`, optional}} - squeeze: `bool`, optional - If True then collapsed axes are removed. By default the - axes which are collapsed are left in the result as axes - with size 1, meaning that the result is guaranteed to - broadcast correctly against the original array. - - weights: data-like or dict, optional - Weights associated with values of the array. By default - all non-missing elements of the array are assumed to have - a weight equal to one. If *weights* is a data-like object - then it must have either the same shape as the array or, - if that is not the case, the same shape as the axes being - collapsed. If *weights* is a dictionary then each key is - axes of the array (an int or tuple of ints) with a - corresponding data-like value of weights for those - axes. In this case, the implied weights array is the outer - product of the dictionary's values. + {{weights: data_like, `dict`, or `None`, optional}} - *Parameter example:* - If ``weights={1: w, (2, 0): x}`` then ``w`` must contain - 1-dimensional weights for axis 1 and ``x`` must contain - 2-dimensional weights for axes 2 and 0. This is - equivalent, for example, to ``weights={(1, 2, 0), y}``, - where ``y`` is the outer product of ``w`` and ``x``. If - ``axes=[1, 2, 0]`` then ``weights={(1, 2, 0), y}`` is - equivalent to ``weights=y``. If ``axes=None`` and the - array is 3-dimensional then ``weights={(1, 2, 0), y}`` - is equivalent to ``weights=y.transpose([2, 0, 1])``. - - mtol: number, optional + {{collapse squeeze: `bool`, optional}} - {{inplace: `bool`, optional}} + {{mtol: number, optional}} - {{i: deprecated at version 3.0.0}} + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + + {{inplace: `bool`, optional}} :Returns: `Data` or `None` The collapsed array. - .. seealso:: `maximum`, `minimum`, `mid_range`, `range`, `sum`, `sd`, - `var` + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.root_mean_square() + - **Examples:** + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.root_mean_square(weights=w) + """ - return self._collapse( - root_mean_square_f, - root_mean_square_fpartial, - root_mean_square_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.rms, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") @@ -11254,7 +10535,7 @@ def round(self, decimals=0, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** >>> d = cf.Data([-1.81, -1.41, -1.01, -0.91, 0.09, 1.09, 1.19, 1.59, 1.99]) >>> print(d.array) @@ -11291,7 +10572,7 @@ def stats( sum=False, sum_of_squares=False, variance=False, - weights=False, + weights=None, ): """Calculate statistics of the data. @@ -11368,24 +10649,14 @@ def stats( Calculate the square root of the weighted or unweighted mean of the squares of the values. - weights: data-like or dict, optional - The weights to apply to the calculations. By default the - statistics are unweighted. - - The weights may be contained in any scalar or array-like - object (such as a numpy array or `Data` instance) that is - broadcastable to the shape of the data. If *weights* is a - dictionary then each key is axes of the array (an `int` or - `tuple` of `int`) with a corresponding data-like value of - weights for those axes. In this case, the implied weights - array is the outer product of the dictionary's values. + {{weights: data_like, `dict`, or `None`, optional}} :Returns: `dict` The statistics. - **Examples:** + **Examples** >>> d = cf.Data([[0, 1, 2], [3, -99, 5]], mask=[[0, 0, 0], [0, 1, 0]]) >>> print(d.array) @@ -11430,15 +10701,14 @@ def stats( 'sample_size': 5} """ - no_weights = ( "minimum", + "median", "maximum", "range", "mid_range", "minimum_absolute_value", "maximum_absolute_value", - "median", "sum", "sum_of_squares", ) @@ -11462,14 +10732,13 @@ def stats( "variance", ): if all or locals()[stat]: - f = getattr(self, stat) + func = getattr(self, stat) if stat in no_weights: - value = f(squeeze=True) + value = func(squeeze=True) else: - value = f(squeeze=True, weights=weights) + value = func(squeeze=True, weights=weights) out[stat] = value - # --- End: for if all or sample_size: out["sample_size"] = int(self.sample_size()) @@ -11534,7 +10803,7 @@ def fits_in_memory(self, itemsize): `bool` - **Examples:** + **Examples** >>> print(d.fits_in_memory(8)) False @@ -11844,7 +11113,7 @@ def sin(self, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** >>> d.Units @@ -11905,7 +11174,7 @@ def sinh(self, inplace=False): `Data` or `None` - **Examples:** + **Examples** >>> d.Units @@ -11964,7 +11233,7 @@ def cosh(self, inplace=False): `Data` or `None` - **Examples:** + **Examples** >>> d.Units @@ -12026,7 +11295,7 @@ def tanh(self, inplace=False): `Data` or `None` - **Examples:** + **Examples** >>> d.Units @@ -12133,7 +11402,7 @@ def squeeze(self, axes=None, inplace=False, i=False): `Data` or `None` The squeezed data array. - **Examples:** + **Examples** >>> v.shape (1,) @@ -12233,7 +11502,7 @@ def tan(self, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** >>> d.Units @@ -12284,7 +11553,7 @@ def tolist(self): The (nested) list of array elements, or a scalar if the data has 0 dimensions. - **Examples:** + **Examples** >>> d = cf.Data(9) >>> d.tolist() @@ -12328,7 +11597,7 @@ def transpose(self, axes=None, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** >>> d.shape (19, 73, 96) @@ -12394,7 +11663,7 @@ def trunc(self, inplace=False, i=False): `Data` or `None` - **Examples:** + **Examples** >>> d = cf.Data([-1.9, -1.5, -1.1, -1, 0, 1, 1.1, 1.5 , 1.9]) >>> print(d.array) @@ -12676,7 +11945,7 @@ def func( `Data` or `None` - **Examples:** + **Examples** >>> d.Units @@ -12737,27 +12006,42 @@ def func( return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def range( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, - _preserve_partitions=False, i=False, ): - """Collapse axes with the absolute difference between their - maximum and minimum values. + """Calculate range values. + + The range is the maximum minus the minimum. - Missing data array elements are omitted from the calculation. + Calculates the range value or the range values along axes. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `sample_size`, - `sd`, `sum`, `sum_of_weights`, `sum_of_weights2`, - `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `max`, `min`, `mid_range` :Parameters: + {{collapse axes: (sequence of) `int`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + {{inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} @@ -12767,20 +12051,30 @@ def range( `Data` or `None` The collapsed array. - **Examples:** + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.range() + """ - return self._collapse( - range_f, - range_fpartial, - range_ffinalise, - axes=axes, - squeeze=squeeze, - weights=None, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.range, + d, + axis=axes, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) @@ -12833,30 +12127,43 @@ def roll(self, axis, shift, inplace=False, i=False): return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def sum( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapse axes with their sum. + """Calculate sum values. + + Calculates the sum value or the sum values along axes. - Missing data array elements are omitted from the calculation. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `range`, - `sample_size`, `sd`, `sum_of_weights`, - `sum_of_weights2`, `var` + ..seealso:: `sample_size`, `integral`, `mean`, `sd`, + `sum_of_squares`, `sum_of_weights` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} - squeeze : bool, optional + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -12865,45 +12172,77 @@ def sum( :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. + + **Examples** + + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.sum() + - **Examples:** + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sum(weights=cf.Data(w, 'm')) + """ - return self._collapse( - sum_f, - sum_fpartial, - sum_ffinalise, - axes=axes, - squeeze=squeeze, + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.sum, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) def sum_of_squares( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, + split_every=None, inplace=False, - _preserve_partitions=False, ): - """Collapse axes with the sum of the squares of the values. + """Calculate sums of squares. + + Calculates the sum of squares or the sum of squares values + along axes. - Missing data array elements are omitted from the calculation. + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `range`, - `sample_size`, `sd`, `sum_of_weights`, - `sum_of_weights2`, `var` + ..seealso:: `sample_size`, `sum`, `sum_of_squares`, + `sum_of_weights2` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} - squeeze : bool, optional + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -12913,55 +12252,82 @@ def sum_of_squares( The collapsed data, or `None` if the operation was in-place. - **Examples:** + **Examples** - >>> d = cf.Data([[-1, 2, 3], [9, -8, -12]], 'm') + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] >>> d.sum_of_squares() - - >>> d.sum_of_squares(axes=1) - + - """ - units = self.Units - if units: - units = units ** 2 + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sum_of_squares(weights=w) + - return self._collapse( - sum_of_squares_f, - sum_of_squares_fpartial, - sum_of_squares_ffinalise, + """ + d = _inplace_enabled_define_and_cleanup(self) + d.square(inplace=True) + d.sum( axes=axes, - squeeze=squeeze, weights=weights, - units=units, + squeeze=squeeze, mtol=mtol, - inplace=inplace, - _preserve_partitions=_preserve_partitions, + split_every=split_every, + inplace=True, ) + return d + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") + @_inplace_enabled(default=False) def sum_of_weights( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapse axes with the sum of weights. + """Calculate sums of weights. + + Calculates the sum of weights or the sum of weights values + along axes. - Missing data array elements are omitted from the calculation. + The weights given by the *weights* parameter are internally + broadcast to the shape of the data, and those weights that are + missing data, or that correspond to the missing elements of + the data, are assigned a weight of 0. It is these processed + weights that are summed. - .. seealso:: `maximum`, `mean`, `mid_range`, `minimum`, `range`, - `sample_size`, `sd`, `sum`, `sum_of_weights2`, `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `sum`, `sum_of_squares`, + `sum_of_weights2` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} - squeeze : bool, optional + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {[inplace: `bool`, optional}} @@ -12970,293 +12336,284 @@ def sum_of_weights( :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. - **Examples:** + **Examples** - """ - if weights is None: - units = Units() - else: - weights_units = getattr(weights, "Units", None) - if weights_units is not None: - units = weights_units - else: - units = Units("1") - for w in weights.values(): - weights_units = getattr(w, "Units", None) - if weights_units is not None: - units = units * weights_units - # --- End: if + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.sum_of_weights() + - return self._collapse( - sw_f, - sw_fpartial, - sw_ffinalise, - axes=axes, - squeeze=squeeze, + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sum_of_weights(weights=w) + + + >>> d.sum_of_weights(weights=cf.Data(w, 'm')) + + + """ + d = _inplace_enabled_define_and_cleanup(self) + d, weights = _collapse( + Collapse.sum_of_weights, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - units=units, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + units = _units_None + if weights is not None: + units = getattr(weights, "Units", None) + if units is None: + units = _units_None + + d.override_units(units, inplace=True) + + return d + + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") + @_inplace_enabled(default=False) def sum_of_weights2( self, axes=None, + weights=None, squeeze=False, mtol=1, - weights=None, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapse axes with the sum of squares of weights. + """Calculate sums of squares of weights. + + Calculates the sum of squares of weights or the sum of squares + of weights values along axes. - Missing data array elements are omitted from the calculation. + The weights given by the *weights* parameter are internally + broadcast to the shape of the data, and those weights that + are missing data, or that correspond to the missing elements + of the data, are assigned a weight of 0. It is these processed + weights that are squared and summed. - .. seealso:: `maximum`, `mean`, `mid_range`, `minimum`, `range`, - `sample_size`, `sd`, `sum`, `sum_of_weights`, `var` + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `sum`, `sum_of_squares`, + `sum_of_weights` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} - squeeze : bool, optional + {{weights: data_like, `dict`, or `None`, optional}} - {{inplace: `bool`, optional}} + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK + + {[inplace: `bool`, optional}} {{i: deprecated at version 3.0.0}} :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. - **Examples:** + **Examples** - """ - if weights is None: - units = Units() - else: - weights_units = getattr(weights, "Units", None) - if weights_units is not None: - units = weights_units - else: - units = Units("1") - for w in weights.values(): - weights_units = getattr(w, "Units", None) - if weights_units is not None: - units = units * (weights_units ** 2) - # --- End: if + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.sum_of_weights2() + - return self._collapse( - sw2_f, - sw2_fpartial, - sw2_ffinalise, - axes=axes, - squeeze=squeeze, + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.sum_of_weights2(weights=w) + + + >>> d.sum_of_weights2(weights=cf.Data(w, 'm')) + + + """ + d = _inplace_enabled_define_and_cleanup(self) + d, weights = _collapse( + Collapse.sum_of_weights2, + d, + axis=axes, weights=weights, + keepdims=not squeeze, + split_every=split_every, mtol=mtol, - units=units, - inplace=inplace, - _preserve_partitions=_preserve_partitions, ) + units = _units_None + if weights is not None: + units = getattr(weights, "Units", None) + if not units: + units = _units_None + else: + units = units ** 2 + + d.override_units(units, inplace=True) + + return d + + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") - def sd( + @_inplace_enabled(default=False) + def std( self, axes=None, squeeze=False, mtol=1, weights=None, ddof=0, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - r"""Collapse axes by calculating their standard deviation. - - The standard deviation may be adjusted for the number of degrees of - freedom and may be calculated with weighted values. - - Missing data array elements and those with zero weight are omitted - from the calculation. - - The unweighted standard deviation, :math:`s`, of :math:`N` values - :math:`x_i` with mean :math:`m` and with :math:`N-ddof` degrees of - freedom (:math:`ddof\ge0`) is: - - .. math:: s=\sqrt{\\frac{1}{N-ddof} \sum_{i=1}^{N} (x_i - m)^2} - - The weighted standard deviation, :math:`\\tilde{s}_N`, of :math:`N` - values :math:`x_i` with corresponding weights :math:`w_i`, weighted - mean :math:`\\tilde{m}` and with :math:`N` degrees of freedom is: - - .. math:: \\tilde{s}_N=\sqrt{\\frac{1}{\sum_{i=1}^{N} w_i} - \sum_{i=1}^{N} w_i(x_i - \\tilde{m})^2} - - The weighted standard deviation, :math:`\\tilde{s}`, of :math:`N` - values :math:`x_i` with corresponding weights :math:`w_i` and with - :math:`N-ddof` degrees of freedom (:math:`ddof>0`) is: - - .. math:: \\tilde{s} = \sqrt{\\frac{f \sum_{i=1}^{N} w_i}{f - \sum_{i=1}^{N} w_i - ddof}} \\tilde{s}_N - - where :math:`f` is the smallest positive number whose product with - each weight is an integer. :math:`f \sum_{i=1}^{N} w_i` is the - size of a new sample created by each :math:`x_i` having - :math:`fw_i` repeats. In practice, :math:`f` may not exist or may - be difficult to calculate, so :math:`f` is either set to a - predetermined value or an approximate value is calculated. The - approximation is the smallest positive number whose products with - the smallest and largest weights and the sum of the weights are - all integers, where a positive number is considered to be an - integer if its decimal part is sufficiently small (no greater than - :math:`10^{-8}` plus :math:`10^{-5}` times its integer part). This - approximation will never overestimate :math:`f`, so - :math:`\\tilde{s}` will never be underestimated when the - approximation is used. If the weights are all integers which are - collectively coprime then setting :math:`f=1` will guarantee that - :math:`\\tilde{s}` is exact. + r"""Calculate standard deviations. + + Calculates the standard deviation of an array or the standard + deviations along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. + + ..seealso:: `sample_size`, `mean`, `sum`, `var` :Parameters: - axes : (sequence of) `int`, optional - The axes to be collapsed. By default flattened input is - used. Each axis is identified by its integer position. No - axes are collapsed if *axes* is an empty sequence. + {{collapse axes: (sequence of) `int`, optional}} - squeeze : `bool`, optional - If True then collapsed axes are removed. By default the - axes which are collapsed are left in the result as axes - with size 1. When the collapsed axes are retained, the - result is guaranteed to broadcast correctly against the - original array. + {{weights: data_like, `dict`, or `None`, optional}} - *Parameter example:* - Suppose that an array, ``d``, has shape (2, 3, 4) and - ``e = d.sd(axis=1)``. Then ``e`` has shape (2, 1, 4) - and, for example, ``d/e`` is allowed. If ``e = - d.sd(axis=1, squeeze=True)`` then ``e`` will have shape - (2, 4) and ``d/e`` is an illegal operation. - - weights : data-like or `dict`, optional - Weights associated with values of the array. By default - all non-missing elements of the array are assumed to have - equal weights of 1. If *weights* is a data-like object - then it must have either the same shape as the array or, - if that is not the case, the same shape as the axes being - collapsed. If *weights* is a dictionary then each key is - axes of the array (an int or tuple of ints) with a - corresponding data-like value of weights for those - axes. In this case, the implied weights array is the outer - product of the dictionary's values it may be used in - conjunction with any value of *axes*, because the axes to - which the weights apply are given explicitly. + {{collapse squeeze: `bool`, optional}} - *Parameter example:* - Suppose that the original array being collapsed has - shape (2, 3, 4) and *weights* is set to a data-like - object, ``w``. If ``axes=None`` then ``w`` must have - shape (2, 3, 4). If ``axes=(0, 1, 2)`` then ``w`` must - have shape (2, 3, 4). If ``axes=(2, 0, 1)`` then ``w`` - must either have shape (2, 3, 4) or else (4, 2, 3). If - ``axes=1`` then ``w`` must either have shape (2, 3, 4) - or else (3,). If ``axes=(2, 0)`` then ``w`` must either - have shape (2, 3, 4) or else (4, 2). Suppose *weights* - is a dictionary. If ``weights={1: x}`` then ``x`` must - have shape (3,). If ``weights={1: x, (2, 0): y}`` then - ``x`` must have shape (3,) and ``y`` must have shape (4, - 2). The last example is equivalent to ``weights={(1, 2, - 0): x.outerproduct(y)}`` (see `outerproduct` for - details). - - mtol : number, optional - For each element in the output data array, the fraction of - contributing input array elements which is allowed to - contain missing data. Where this fraction exceeds *mtol*, - missing data is returned. The default is 1, meaning a - missing datum in the output array only occurs when its - contributing input array elements are all missing data. A - value of 0 means that a missing datum in the output array - occurs whenever any of its contributing input array - elements are missing data. Any intermediate value is - permitted. - - ddof : number, optional - The delta degrees of freedom. The number of degrees of - freedom used in the calculation is (N-*ddof*) where N - represents the number of elements. By default *ddof* is 0 + {{mtol: number, optional}} + + {{ddof: number}} + + By default *ddof* is 0. + + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} - {{i: deprecated at version 3.0.0}} + {{i: deprecated at version 3.0.0}} + + :Returns: + + `Data` or `None` + The collapsed data, or `None` if the operation was + in-place. + + **Examples** - :Returns: + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.std() + + >>> d.std(ddof=1) + - `Data` or `None` + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.std(ddof=1, weights=w) + - **Examples:** - - >>> d = cf.Data([1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4]) - >>> e = cf.Data([1, 2, 3, 4]) - >>> d.sd(squeeze=False) - - >>> d.sd() - - >>> e.sd(weights=[2, 3, 5, 6]) - - >>> e.sd(weights=[2, 3, 5, 6], f=1) - - >>> d.sd(ddof=0) - - >>> e.sd(ddof=0, weights=[2, 3, 5, 6]) - - - """ - return self._collapse( - sd_f, - sd_fpartial, - sd_ffinalise, + """ + d = _inplace_enabled_define_and_cleanup(self) + d.var( axes=axes, - squeeze=squeeze, weights=weights, + squeeze=squeeze, mtol=mtol, ddof=ddof, - inplace=inplace, - _preserve_partitions=_preserve_partitions, + split_every=split_every, + inplace=True, ) + d.sqrt(inplace=True) + return d + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) @_deprecated_kwarg_check("i") def var( self, axes=None, - squeeze=False, weights=None, + squeeze=False, mtol=1, ddof=0, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Collapse axes with their weighted variance. + """Calculate variances. - The units of the returned array are the square of the units of the - array. + Calculates the variance of an array or the variance values + along axes. + + See + https://ncas-cms.github.io/cf-python/analysis.html#collapse-methods + for mathematical definitions. - .. seealso:: `maximum`, `minimum`, `mean`, `mid_range`, `range`, `sum`, - `sd`, `stats` + ..seealso:: `sample_size`, `mean`, `sd`, `sum` :Parameters: - axes : (sequence of) int, optional + {{collapse axes: (sequence of) `int`, optional}} + + {{weights: data_like, `dict`, or `None`, optional}} + + {{collapse squeeze: `bool`, optional}} + + {{mtol: number, optional}} + + {{ddof: number}} - squeeze : bool, optional + By default *ddof* is 0. - weights : + {{split_every: `int` or `dict`, optional}} + + .. versionadded:: TODODASK {{inplace: `bool`, optional}} @@ -13265,29 +12622,49 @@ def var( :Returns: `Data` or `None` - The collapsed array. + The collapsed data, or `None` if the operation was + in-place. - **Examples:** + **Examples** - """ - units = self.Units - if units: - units = units ** 2 + >>> a = np.ma.arange(12).reshape(4, 3) + >>> d = cf.Data(a, 'K') + >>> d[1, 1] = np.ma.masked + >>> print(d.array) + [[0 1 2] + [3 -- 5] + [6 7 8] + [9 10 11]] + >>> d.var() + + >>> d.var(ddof=1) + - return self._collapse( - var_f, - var_fpartial, - var_ffinalise, - axes=axes, - squeeze=squeeze, + >>> w = np.linspace(1, 2, 3) + >>> print(w) + [1. 1.5 2. ] + >>> d.var(ddof=1, weights=w) + + + """ + d = _inplace_enabled_define_and_cleanup(self) + d, _ = _collapse( + Collapse.var, + d, + axis=axes, weights=weights, + keepdims=not squeeze, mtol=mtol, - units=units, ddof=ddof, - inplace=inplace, - _preserve_partitions=_preserve_partitions, + split_every=split_every, ) + units = d.Units + if units: + d.override_units(units ** 2, inplace=True) + + return d + @daskified(_DASKIFIED_VERBOSE) def section( self, axes, stop=None, chunks=False, min_step=1, mode="dictionary" @@ -13341,7 +12718,7 @@ def section( The dictionary of m dimensional sections of the Data object. - **Examples:** + **Examples** >>> d = cf.Data(np.arange(120).reshape(2, 6, 10)) >>> d @@ -13378,52 +12755,210 @@ def section( return _section(self, axes, min_step=min_step) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + def square(self, dtype=None, inplace=False): + """Calculate the element-wise square. + + .. versionadded:: TODODASK + + .. seealso:: `sqrt`, `sum_of_squares` + + :Parameters: + + dtype: data-type, optional + Overrides the data type of the output arrays. A + matching precision of the calculation should be + chosen. For example, a *dtype* of ``'int32'`` is only + allowed when the input values are integers. + + {{inplace: `bool`, optional}} + + :Returns: + + `Data` or `None` + The element-wise square of the data, or `None` if the + operation was in-place. + + **Examples** + + >>> d = cf.Data([[0, 1, 2.5, 3, 4]], 'K', mask=[[0, 0, 0, 1, 0]]) + >>> print(d.array) + [[0.0 1.0 2.5 -- 4.0]] + >>> e = d.square() + >>> e + + >>> print(e.array) + [[0.0 1.0 6.25 -- 16.0]] + + """ + d = _inplace_enabled_define_and_cleanup(self) + dx = d.to_dask_array() + dx = da.square(dx, dtype=dtype) + d._set_dask(dx, reset_mask_hardness=False) + + units = d.Units + if units: + d.override_units(units ** 2, inplace=True) + + return d + + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + def sqrt(self, dtype=None, inplace=False): + """Calculate the non-negative square root. + + .. versionadded:: TODODASK + + .. seealso:: `square` + + :Parameters: + + dtype: data-type, optional + Overrides the data type of the output arrays. A + matching precision of the calculation should be + chosen. For example, a *dtype* of ``'int32'` is not + allowed, even if the input values are perfect squares. + + {{inplace: `bool`, optional}} + + :Returns: + + `Data` or `None` + The element-wise positive square root of the data, or + `None` if the operation was in-place. + + **Examples** + + >>> d = cf.Data([[0, 1, 2, 3, 4]], 'K2', mask=[[0, 0, 0, 1, 0]]) + >>>print(d.array) + [[0 1 2 -- 4]] + >>> e = d.sqrt() + >>> e + + >>> print(e.array) + [[0.0 1.0 1.4142135623730951 -- 2.0]] + + Negative input values raise a warning but nonetheless result in NaN + or, if there are already missing values, missing data: + + >>> import warnings + >>> d = cf.Data([0, 1, -4]) + >>> print(d.array) + [ 0 1 -4] + >>> with warnings.catch_warnings(): + ... warnings.simplefilter("ignore") + ... print(d.sqrt().array) + ... + [ 0. 1. nan] + + >>> d = cf.Data([0, 1, -4], mask=[1, 0, 0]) + >>> print(d.array) + [-- 1 -4] + >>> with warnings.catch_warnings(): + ... warnings.simplefilter("ignore") + ... print(d.sqrt().array) + ... + [-- 1.0 --] + + """ + d = _inplace_enabled_define_and_cleanup(self) + dx = d.to_dask_array() + dx = da.sqrt(dx, dtype=dtype) + d._set_dask(dx, reset_mask_hardness=False) + + units = d.Units + if units: + try: + d.override_units(units ** 0.5, inplace=True) + except ValueError as e: + raise type(e)( + f"Incompatible units for taking a square root: {units!r}" + ) + + return d + # ---------------------------------------------------------------- - # Alias + # Aliases # ---------------------------------------------------------------- @property def dtarray(self): """Alias for `datetime_array`""" return self.datetime_array - def max( + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") + def maximum( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Alias for `maximum`""" - return self.maximum( + """Alias for `max`""" + return self.max( axes=axes, squeeze=squeeze, mtol=mtol, + split_every=split_every, inplace=inplace, i=i, - _preserve_partitions=_preserve_partitions, ) - def min( + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") + def minimum( self, axes=None, squeeze=False, mtol=1, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Alias for `minimum`""" - return self.minimum( + """Alias for `min`""" + return self.min( axes=axes, squeeze=squeeze, mtol=mtol, + split_every=split_every, inplace=inplace, i=i, - _preserve_partitions=_preserve_partitions, ) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") + def sd( + self, + axes=None, + squeeze=False, + mtol=1, + weights=None, + ddof=0, + split_every=None, + inplace=False, + i=False, + ): + """Alias for `std`""" + return self.std( + axes=axes, + squeeze=squeeze, + weights=weights, + mtol=mtol, + ddof=ddof, + split_every=split_every, + inplace=inplace, + i=i, + ) + + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") def standard_deviation( self, axes=None, @@ -13431,21 +12966,25 @@ def standard_deviation( mtol=1, weights=None, ddof=0, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): - """Alias for `sd`""" - return self.sd( + """Alias for `std`""" + return self.std( axes=axes, squeeze=squeeze, weights=weights, mtol=mtol, ddof=ddof, + split_every=split_every, inplace=inplace, - _preserve_partitions=_preserve_partitions, + i=i, ) + @daskified(_DASKIFIED_VERBOSE) + @_inplace_enabled(default=False) + @_deprecated_kwarg_check("i") def variance( self, axes=None, @@ -13453,9 +12992,9 @@ def variance( weights=None, mtol=1, ddof=0, + split_every=None, inplace=False, i=False, - _preserve_partitions=False, ): """Alias for `var`""" return self.var( @@ -13464,14 +13003,12 @@ def variance( weights=weights, mtol=mtol, ddof=ddof, + split_every=split_every, inplace=inplace, - _preserve_partitions=_preserve_partitions, + i=i, ) -# --- End: class - - def _size_of_index(index, size=None): """Return the number of elements resulting in applying an index to a sequence. @@ -13490,7 +13027,7 @@ def _size_of_index(index, size=None): `int` The length of the sequence resulting from applying the index. - **Examples:** + **Examples** >>> _size_of_index(slice(None, None, -2), 10) 5 @@ -13510,109 +13047,6 @@ def _size_of_index(index, size=None): return len(index) -def _overlapping_partitions(partitions, indices, axes, master_flip): - """Return the nested list of (modified) partitions which overlap the - given indices to the master array. - - :Parameters: - - partitions : cf.PartitionMatrix - - indices : tuple - - axes : sequence of str - - master_flip : list - - :Returns: - - numpy array - A numpy array of cf.Partition objects. - - **Examples:** - - >>> type(f.Data) - - >>> d._axes - ['dim1', 'dim2', 'dim0'] - >>> axis_to_position = {'dim0': 2, 'dim1': 0, 'dim2' : 1} - >>> indices = (slice(None), slice(5, 1, -2), [1,3,4,8]) - >>> x = _overlapping_partitions(d.partitions, indices, axis_to_position, master_flip) - - """ - - axis_to_position = {} - for i, axis in enumerate(axes): - axis_to_position[axis] = i - - if partitions.size == 1: - partition = partitions.matrix.item() - - # Find out if this partition overlaps the original slice - p_indices, shape = partition.overlaps(indices) - - if p_indices is None: - # This partition is not in the slice out of bounds - raise - # error? - return - - # Still here? Create a new partition - partition = partition.copy() - partition.new_part(p_indices, axis_to_position, master_flip) - partition.shape = shape - - new_partition_matrix = np.empty(partitions.shape, dtype=object) - new_partition_matrix[...] = partition - - return new_partition_matrix - # --- End: if - - # Still here? Then there are 2 or more partitions. - - partitions_list = [] - partitions_list_append = partitions_list.append - - flat_pm_indices = [] - flat_pm_indices_append = flat_pm_indices.append - - partitions_flat = partitions.matrix.flat - - i = partitions_flat.index - - for partition in partitions_flat: - # Find out if this partition overlaps the original slice - p_indices, shape = partition.overlaps(indices) - - if p_indices is None: - # This partition is not in the slice - i = partitions_flat.index - continue - - # Still here? Then this partition overlaps the slice, so - # create a new partition. - partition = partition.copy() - partition.new_part(p_indices, axis_to_position, master_flip) - partition.shape = shape - - partitions_list_append(partition) - - flat_pm_indices_append(i) - - i = partitions_flat.index - # --- End: for - - new_shape = [ - len(set(s)) - for s in np.unravel_index(flat_pm_indices, partitions.shape) - ] - - new_partition_matrix = np.empty((len(flat_pm_indices),), dtype=object) - new_partition_matrix[...] = partitions_list - new_partition_matrix.resize(new_shape) - - return new_partition_matrix - - def _broadcast(a, shape): """Broadcast an array to a given shape. @@ -13695,3 +13129,252 @@ def _where_broadcastable(data, x, name): ) return True + + +def _collapse( + func, + d, + axis=None, + weights=None, + keepdims=True, + mtol=1, + ddof=None, + split_every=None, +): + """Collapse data in-place using a given funcion. + + .. versionadded:: TODODASK + + .. seealso:: `_parse_weights` + + :Parameters: + + func: callable + The function that collapses the underlying `dask` array of + *d*. Must have the minimum signature (parameters and + default values) ``func(dx, axis=None, keepdims=False, + mtol=None, split_every=None)`` (optionally including + ``weights=None`` or ``ddof=None``), where ``dx`` is a the + dask array contained in *d*. + + d: `Data` + The data to be collapsed. + + axis: (sequence of) int, optional + The axes to be collapsed. By default all axes are + collapsed, resulting in output with size 1. Each axis is + identified by its integer position. If *axes* is an empty + sequence then the collapse is applied to each scalar + element and the reuslt has the same shape as the input + data. + + weights: data_like, `dict`, or `None`, optional + Weights associated with values of the data. By default + *weights* is `None`, meaning that all non-missing elements + of the data have a weight of 1 and all missing elements + have a weight of 0. + + If *weights* is a data_like object then it must be + broadcastable to the array. + + If *weights* is a dictionary then each key specifies axes + of the data (an `int` or `tuple` of `int`), with a + corresponding value of data_like weights for those + axes. The dimensions of a weights value must correspond to + its key axes in the same order. Not all of the axes need + weights assigned to them. The weights that will be used + will be an outer product of the dictionary's values. + + However they are specified, the weights are internally + broadcast to the shape of the data, and those weights that + are missing data, or that correspond to the missing + elements of the data, are assigned a weight of 0. + + For collapse functions that do not have a ``weights`` + parameter, *weights* must be `None`. + + keepdims: `bool`, optional + By default, the axes which are collapsed are left in the + result as dimensions with size one, so that the result + will broadcast correctly against the input array. If set + to False then collapsed axes are removed from the data. + + mtol: number, optional + The sample size threshold below which collapsed values are + set to missing data. It is defined as a fraction (between + 0 and 1 inclusive) of the contributing input data values. + + The default of *mtol* is 1, meaning that a missing datum + in the output array occurs whenever all of its + contributing input array elements are missing data. + + For other values, a missing datum in the output array + occurs whenever more than ``100*mtol%`` of its + contributing input array elements are missing data. + + ddof: number, optional + The delta degrees of freedom. The number of degrees of + freedom used in the calculation is (N-*ddof*) where N + represents the number of non-missing elements. + + For collapse functions that do not have a ``ddof`` + parameter, *ddof* must be `None`. + + split_every: `int` or `dict`, optional + Determines the depth of the recursive aggregation. See + `dask.array.reduction` for details. + + :Returns: + + (`Data`, formatted weights) + The collapsed data and the output of ``_parse_weights(d, + weights, axis)``. + + """ + kwargs = { + "axis": axis, + "keepdims": keepdims, + "split_every": split_every, + "mtol": mtol, + } + + weights = _parse_weights(d, weights, axis) + if weights is not None: + kwargs["weights"] = weights + + if ddof is not None: + kwargs["ddof"] = ddof + + dx = d.to_dask_array() + dx = func(dx, **kwargs) + d._set_dask(dx, reset_mask_hardness=True) + + return d, weights + + +def _parse_weights(d, weights, axis=None): + """Parse the weights input to `_collapse`. + + .. versionadded:: TODODASK + + .. seealso:: `_collapse` + + :Parameters: + + d: `Data` + The data to be collapsed. + + weights: data_like or `dict` + See `_collapse` for details. + + axis: (sequence of) `int`, optional + See `_collapse` for details. + + :Returns: + + `Data` or `None` + * If *weights* is a data_like object then they are + returned unchanged as a `Data` object. It is up to the + downstream functions to check if the weights can be + broadcast to the data. + + * If *weights* is a dictionary then the dictionary + values', i.e. the weights components, outer product is + returned in `Data` object that is broadcastable to the + data. + + If the dictionary is empty, or none of the axes defined + by the keys correspond to collapse axes defined by + *axis*, then then the collapse is unweighted and `None` + is returned. + + Note that, in all cases, the returned weights are *not* + modified to account for missing values in the data. + + **Examples** + + >>> d = cf.Data(np.arange(12)).reshape(4, 3) + + >>> _parse_weights(d, [1, 2, 1], (0, 1)) + + + >>> _parse_weights(d, [[1, 2, 1]], (0, 1)) + + + >>> _parse_weights(d, {1: [1, 2, 1]}, (0, 1)) + + + >>> print(_parse_weights(d, {0: [1, 2, 3, 4], 1: [1, 2, 1]}, (0, 1))) + [[1 2 1] + [2 4 2] + [3 6 3] + [4 8 4]] + + >>> print(cf.data.data._parse_weights(d, {}, (0, 1))) + None + + >>> print(cf.data.data._parse_weights(d, {1: [1, 2, 1]}, 0)) + None + + """ + if weights is None: + # No weights + return + + if not isinstance(weights, dict): + # Weights is data_like. Don't check broadcastability to d, + # leave that to whatever uses the weights. + return Data.asdata(weights) + + if not weights: + # No weights (empty dictionary) + return + + if axis is None: + axis = tuple(range(d.ndim)) + else: + axis = d._parse_axes(axis) + + weights = weights.copy() + weights_axes = set() + for key, value in tuple(weights.items()): + del weights[key] + key = d._parse_axes(key) + if weights_axes.intersection(key): + raise ValueError("Duplicate weights axis") + + weights[tuple(key)] = value + weights_axes.update(key) + + if not weights_axes.intersection(axis): + # No weights span collapse axes + return + + # For each component, add missing dimensions as size 1. + w = [] + shape = d.shape + for key, value in weights.items(): + value = Data.asdata(value) + + # Make sure axes are in ascending order + skey = tuple(sorted(key)) + if key != skey: + value = value.transpose(skey) + key = skey + + if not all( + True if i in (j, 1) else False + for i, j in zip(value.shape, [shape[i] for i in key]) + ): + raise ValueError( + f"Weights component for axes {tuple(key)} with shape " + f"{value.shape} is not broadcastable to data with " + f"shape {shape}" + ) + + new_shape = [n if i in key else 1 for i, n in enumerate(shape)] + w.append(value.reshape(new_shape)) + + # Return the product of the weights components, which will be + # broadcastable to d + return reduce(mul, w) diff --git a/cf/data/functions.py b/cf/data/functions.py index 26895d5c76..7a7405fd7f 100644 --- a/cf/data/functions.py +++ b/cf/data/functions.py @@ -30,7 +30,7 @@ def _open_netcdf_file(filename, mode, fmt="NETCDF4"): # set_auto_mask=True): `netCDF4.Dataset` A netCDF4.Dataset instance for the netCDF file. - **Examples:** + **Examples** >>> nc1 = _open_netcdf_file('file.nc') >>> nc1 diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index 20026be866..fd2661ae4a 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -69,6 +69,31 @@ def __hash__(self): removed_at="5.0.0", ) + @property + def _HDF_chunks(self): + """The HDF chunksizes. + + Deprecated at version TODODASK. + + DO NOT CHANGE IN PLACE. + + """ + _DEPRECATION_ERROR_ATTRIBUTE( + self, "_HDF_chunks", version="TODODASK", removed_at="5.0.0" + ) # pragma: no cover + + @_HDF_chunks.setter + def _HDF_chunks(self, value): + _DEPRECATION_ERROR_ATTRIBUTE( + self, "_HDF_chunks", version="TODODASK", removed_at="5.0.0" + ) # pragma: no cover + + @_HDF_chunks.deleter + def _HDF_chunks(self): + _DEPRECATION_ERROR_ATTRIBUTE( + self, "_HDF_chunks", version="TODODASK", removed_at="5.0.0" + ) # pragma: no cover + @property def Data(self): """Deprecated at version 3.0.0, use attribute `data` instead.""" @@ -161,7 +186,7 @@ def fits_in_one_chunk_in_memory(self, itemsize): def ispartitioned(self): """True if the data array is partitioned. - **Examples:** + **Examples** >>> d._pmsize 1 @@ -209,7 +234,7 @@ def chunk(self, chunksize=None, total=None, omit_axes=None, pmshape=None): `None` - **Examples:** + **Examples** >>> d.chunk() >>> d.chunk(100000) @@ -307,7 +332,7 @@ def ismasked(self): TODODASK - **Examples:** + **Examples** >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) >>> print(d.ismasked) @@ -330,7 +355,7 @@ def varray(self): .. seealso:: `array`, `datetime_array` - **Examples:** + **Examples** >>> a = d.varray >>> type(a) @@ -359,7 +384,7 @@ def add_partitions(self, extra_boundaries, pdim): `None` - **Examples:** + **Examples** >>> d.add_partitions( ) @@ -376,7 +401,7 @@ def partition_boundaries(self): `dict` - **Examples:** + **Examples** """ _DEPRECATION_ERROR_METHOD( diff --git a/cf/data/partition.py b/cf/data/partition.py index 3f61e30724..cc52c69ba7 100644 --- a/cf/data/partition.py +++ b/cf/data/partition.py @@ -96,7 +96,7 @@ def _remove_temporary_files(filename=None): `None` - **Examples:** + **Examples** >>> _remove_temporary_files() @@ -241,7 +241,7 @@ def __init__( The units of the partition's subarray. DO NOT UPDATE INPLACE. - **Examples:** + **Examples** >>> p = Partition(subarray = numpy.arange(20).reshape(2,5,1), ... location = [(0, 6), (1, 3), (4, 5)], @@ -352,7 +352,7 @@ def __del__(self): # `dict` # A dictionary of the instance's attributes # - # **Examples:** + # **Examples** # # ''' # return dict([(attr, getattr(self, attr)) @@ -465,7 +465,7 @@ def _configure_auxiliary_mask(self, auxiliary_mask): `None` - **:Examples:** + **:Examples** >>> p._configure_auxiliary_mask([mask_component1, mask_component2]) @@ -506,7 +506,7 @@ def indices(self): A tuple of slice objects or, if the master data array is a scalar array, an empty tuple. - **Examples:** + **Examples** >>> p.location [(0, 5), (2, 9)] @@ -526,7 +526,7 @@ def in_memory(self): """True if and only if the partition's subarray is in memory as opposed to on disk. - **Examples:** + **Examples** >>> p.in_memory False @@ -542,7 +542,7 @@ def in_memory(self): # # .. seealso:: `array`, `in_memory`, `on_disk`, `to_shared_memory` # - # **Examples:** + # **Examples** # # >>> p.in_shared_memory # True @@ -561,7 +561,7 @@ def in_cached_file(self): .. seealso:: `array`, `in_memory`, `in_shared_memory`, `on_disk`, `to_disk` - **Examples:** + **Examples** >>> p.in_cached_file False @@ -574,7 +574,7 @@ def on_disk(self): """True if and only if the partition's subarray is on disk as opposed to in memory. - **Examples:** + **Examples** >>> p.on_disk True @@ -590,7 +590,7 @@ def in_file(self): """True if and only if the partition's subarray is on disk as opposed to in memory. - **Examples:** + **Examples** >>> p.on_disk True @@ -611,7 +611,7 @@ def isscalar(self): """True if and only if the partition's data array is a scalar array. - **Examples:** + **Examples** >>> p.axes [] @@ -657,7 +657,7 @@ def size(self): """Number of elements in the partition's data array (not its subarray). - **Examples:** + **Examples** >>> p.shape (73, 48) @@ -691,7 +691,7 @@ def subarray(self): # # True if and only if the partition's subarray is in an external file. # - # **Examples:** + # **Examples** # # >>> p.subarray_in_external_file # False @@ -715,7 +715,7 @@ def change_axis_names(self, axis_map): `None` - **Examples:** + **Examples** >>> p.axes ['dim0', 'dim1'] @@ -770,7 +770,7 @@ def close(self, **kwargs): None - **Examples:** + **Examples** >>> p.array(...) >>> p.close() @@ -951,7 +951,7 @@ def copy(self): A deep copy. - **Examples:** + **Examples** >>> q = p.copy() @@ -1231,7 +1231,7 @@ def array(self): def isdt(self): """True if the subarray contains date-time objects. - **Examples:** + **Examples** >>> p.Units.isreftime True @@ -1250,7 +1250,7 @@ def file_close(self): `None` - **Examples:** + **Examples** >>> p.file_close() @@ -1271,7 +1271,7 @@ def file_close(self): # generator # An iterator that yields the partition itself. # - # **Examples:** + # **Examples** # # >>> type(p.flat()) # @@ -1297,7 +1297,7 @@ def file_close(self): # generator # An iterator over indices of the partition's data array. # - # **Examples:** + # **Examples** # # >>> p.shape # [2, 1, 3] @@ -1330,7 +1330,7 @@ def inspect(self): None - **Examples:** + **Examples** >>> f.inspect() @@ -1347,7 +1347,7 @@ def master_ndindex(self): # itermaster_indices(self): An iterator over indices of the master array which are spanned by the data array. - **Examples:** + **Examples** >>> p.location [(3, 5), (0, 1), (0, 3)] @@ -1382,7 +1382,7 @@ def new_part(self, indices, master_axis_to_position, master_flip): None - **Examples:** + **Examples** >>> p.new_part(indices, dim2position, master_flip) @@ -1673,7 +1673,7 @@ def overlaps(self, indices): partition's shape as a list. Otherwise return `None`, `None`. - **Examples:** + **Examples** >>> indices = (slice(None), slice(5, 1, -2), [1, 3, 4, 8]) >>> p.overlaps(indices) @@ -1773,7 +1773,7 @@ def to_disk(self, reopen=True): `bool` True if the subarray was moved to disk, False otherwise. - **Examples:** + **Examples** >>> p.to_disk() >>> p.to_disk(reopen=False) @@ -1850,7 +1850,7 @@ def revert(self): `None` - **Examples:** + **Examples** >>> p.revert() @@ -1884,7 +1884,7 @@ def update_inplace_from(self, other): `None` - **Examples:** + **Examples** >>> p.update_inplace_from(q) diff --git a/cf/data/utils.py b/cf/data/utils.py index d274b5230b..176e3882aa 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -31,7 +31,7 @@ def _is_numeric_dtype(array): `bool` Whether or not the array holds numeric elements. - **Examples:** + **Examples** >>> a = np.array([0, 1, 2]) >>> cf.data.utils._is_numeric_dtype(a) @@ -377,7 +377,7 @@ def new_axis_identifier(existing_axes=(), basename="dim"): `str` The new axis idenfifier. - **Examples:** + **Examples** >>> cf.data.utils.new_axis_identifier() 'dim0' @@ -668,6 +668,8 @@ def YMDhms(d, attr): raise ValueError(f"Can't get {attr}s from data with {units!r}") d = d._asdatetime() - d._map_blocks(partial(cf_YMDhms, attr=attr), dtype=int) + dx = d.to_dask_array() + dx = dx.map_blocks(partial(cf_YMDhms, attr=attr), dtype=int) + d._set_dask(dx, reset_mask_hardness=False) d.override_units(Units(None), inplace=True) return d diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index 188d454dbd..26537d7cec 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -28,14 +28,8 @@ # ---------------------------------------------------------------- # Class description susbstitutions (1 level of indentation) # ---------------------------------------------------------------- - "{{formula terms links}}": """See the parametric vertical coordinate sections of the CF - conventions for more details: - - `4.3.3. Parametric Vertical Coordinate - `_ - - `Appendix D: Parametric Vertical Coordinates - `_""", + "{{formula terms links}}": """See CF section 4.3.3 "Parametric Vertical Coordinate" and CF + Appendix D "Parametric Vertical Coordinates" for details.""", # ---------------------------------------------------------------- # Class description susbstitutions (1 level of indentation) # ---------------------------------------------------------------- @@ -245,6 +239,108 @@ domain axis. If the vertical axis does not appear in the computed non-parametric coodinates then this an empty tuple.""", + # collapse axes + "{{collapse axes: (sequence of) `int`, optional}}": """axes: (sequence of) `int`, optional + The axes to be collapsed. By default all axes are + collapsed, resulting in output with size 1. Each axis + is identified by its integer position. If *axes* is an + empty sequence then the collapse is applied to each + scalar element and the result has the same shape as + the input data.""", + # collapse squeeze + "{{collapse squeeze: `bool`, optional}}": """squeeze: `bool`, optional + By default, the axes which are collapsed are left in + the result as dimensions with size one, so that the + result will broadcast correctly against the input + array. If set to True then collapsed axes are removed + from the data.""", + # collapse keepdims + "{{collapse keepdims: `bool`, optional}}": """keepdims: `bool`, optional + By default, the axes which are collapsed are left in + the result as dimensions with size one, so that the + result will broadcast correctly against the input + array. If set to False then collapsed axes are removed + from the data.""", + # weights + "{{weights: data_like, `dict`, or `None`, optional}}": """weights: data_like, `dict`, or `None`, optional + Weights associated with values of the data. By default + *weights* is `None`, meaning that all non-missing + elements of the data have a weight of 1 and all + missing elements have a weight of 0. + + If *weights* is a data_like object then it must be + broadcastable to the array. + + If *weights* is a dictionary then each key specifies + axes of the data (an `int` or `tuple` of `int`), with + a corresponding value of data_like weights for those + axes. The dimensions of a weights value must + correspond to its key axes in the same order. Not all + of the axes need weights assigned to them. The weights + that will be used will be an outer product of the + dictionary's values. + + However they are specified, the weights are internally + broadcast to the shape of the data, and those weights + that are missing data, or that correspond to the + missing elements of the data, are assigned a weight of + 0.""", + # collapse mtol + "{{mtol: number, optional}}": """mtol: number, optional + The sample size threshold below which collapsed values + are set to missing data. It is defined as a fraction + (between 0 and 1 inclusive) of the contributing input + data values. + + The default of *mtol* is 1, meaning that a missing + datum in the output array occurs whenever all of its + contributing input array elements are missing data. + + For other values, a missing datum in the output array + occurs whenever more than ``100*mtol%`` of its + contributing input array elements are missing data. + + Note that for non-zero values of *mtol*, different + collapsed elements may have different sample sizes, + depending on the distribution of missing data in the + input data.""", + # ddof + "{{ddof: number}}": """ddof: number + The delta degrees of freedom, a non-negative + number. The number of degrees of freedom used in the + calculation is (N-*ddof*) where N represents the + number of non-missing elements. A value of 1 applies + Bessel's correction. If the calculation is weighted + then *ddof* can only be 0 or 1.""", + # split_every + "{{split_every: `int` or `dict`, optional}}": """split_every: `int` or `dict`, optional + Determines the depth of the recursive aggregation. If + set to or more than the number of input chunks, the + aggregation will be performed in two steps, one + partial collapse per input chunk and a single + aggregation at the end. If set to less than that, an + intermediate aggregation step will be used, so that + any of the intermediate or final aggregation steps + operates on no more than ``split_every`` inputs. The + depth of the aggregation graph will be + :math:`log_{split_every}(input chunks along reduced + axes)`. Setting to a low value can reduce cache size + and network transfers, at the cost of more CPU and a + larger dask graph. + + By default, `dask` heuristically decides on a good + value. A default can also be set globally with the + ``split_every`` key in `dask.config`. See + `dask.array.reduction` for details.""", + # Collapse weights + "{{Collapse weights: data_like or `None`, optional}}": """weights: data_like or `None`, optional + Weights associated with values of the array. By + default *weights* is `None`, meaning that all + non-missing elements of the array are assumed to have + a weight equal to one. + + When *weights* is a data_like object then it must have + the same shape as the array.""", # ---------------------------------------------------------------- # Method description susbstitutions (4 levels of indentataion) # ---------------------------------------------------------------- diff --git a/cf/formula_terms.py b/cf/formula_terms.py index 8ecb993ecf..bf1bdd1214 100644 --- a/cf/formula_terms.py +++ b/cf/formula_terms.py @@ -1,6 +1,6 @@ import logging -import cfdm +from cfdm.core import DocstringRewriteMeta from .constants import ( formula_terms_computed_standard_names, @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) -class FormulaTerms(metaclass=cfdm.core.DocstringRewriteMeta): +class FormulaTerms(metaclass=DocstringRewriteMeta): """Functions for computing non-parametric vertical coordinates from the formula defined by a coordinate reference construct. @@ -64,13 +64,13 @@ def __docstring_substitutions__(self): return _docstring_substitution_definitions def __docstring_package_depth__(self): - """Return the package depth for {{package}} docstring + """Return the package depth for "package" docstring substitutions. See `_docstring_package_depth` for details. """ - return 1 + return 0 # ---------------------------------------------------------------- # Private methods @@ -225,8 +225,6 @@ def _computed_standard_name(f, standard_name, coordinate_reference): """Find the standard name of the computed non-parametric vertical coordinates. - {{formula terms links}} - .. versionadded:: 3.8.0 :Parameters: @@ -558,8 +556,6 @@ def _check_standard_name_consistency( s-coordinate, the ocean_sigma over z coordinate, and the ocean double sigma coordinate. - {{formula terms links}} - .. versionadded:: 3.8.0 :Parameters: @@ -680,8 +676,6 @@ def atmosphere_ln_pressure_coordinate( """Compute non-parametric vertical coordinates from atmosphere_ln_pressure_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -761,8 +755,6 @@ def atmosphere_sigma_coordinate( """Compute non-parametric vertical coordinates from atmosphere_sigma_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -858,8 +850,6 @@ def atmosphere_hybrid_sigma_pressure_coordinate( atmosphere_hybrid_sigma_pressure_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -989,8 +979,6 @@ def atmosphere_hybrid_height_coordinate( """Compute non-parametric vertical coordinates from atmosphere_hybrid_height_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1087,8 +1075,6 @@ def atmosphere_sleve_coordinate( """Compute non-parametric vertical coordinates from atmosphere_sleve_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1242,8 +1228,6 @@ def ocean_sigma_coordinate( """Compute non-parametric vertical coordinates from ocean_sigma_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1347,8 +1331,6 @@ def ocean_s_coordinate( """Compute non-parametric vertical coordinates from ocean_s_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1489,8 +1471,6 @@ def ocean_s_coordinate_g1( """Compute non-parametric vertical coordinates from ocean_s_coordinate_g1 parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1618,8 +1598,6 @@ def ocean_s_coordinate_g2( """Compute non-parametric vertical coordinates from ocean_s_coordinate_g2 parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1747,8 +1725,6 @@ def ocean_sigma_z_coordinate( """Compute non-parametric vertical coordinates from ocean_sigma_z_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -1897,8 +1873,6 @@ def ocean_double_sigma_coordinate( """Compute non-parametric vertical coordinates from ocean_double_sigma_coordinate parametric coordinates. - {{formula terms links}} - .. note:: The vertical axis is the last (rightmost) dimension of the returned computed non-parametric vertical coordinates, if applicable. @@ -2051,8 +2025,6 @@ def formula( ): """Compute non-parametric vertical coordinates. - {{formula terms links}} - Dimensional vertical auxiliary coordinate values are computed from parametric vertical coordinate values (usually dimensionless) and associated domain ancillary constructs, as defined by the formula diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index fd6d2e1e35..6925bc8f13 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -43,6 +43,7 @@ ma[0, 3, :, 3] = np.ma.masked ma[1, 2, 3, :] = np.ma.masked +mw = np.ma.array(w, mask=ma.mask) # If True, all tests that will not pass temporarily due to the LAMA-to-Dask # migration will be skipped. These skips will be incrementally removed as the @@ -60,14 +61,23 @@ def reshape_array(a, axes): return b -class DataTest(unittest.TestCase): - - axes_combinations = [ +def axis_combinations(a): + return [ axes for n in range(1, a.ndim + 1) for axes in itertools.combinations(range(a.ndim), n) ] + +class DataTest(unittest.TestCase): + + axes_combinations = axis_combinations(a) + # [ + # axes + # for n in range(1, a.ndim + 1) + # for axes in itertools.combinations(range(a.ndim), n) + # ] + filename = os.path.join( os.path.dirname(os.path.abspath(__file__)), "test_file.nc" ) @@ -81,6 +91,7 @@ class DataTest(unittest.TestCase): a = a w = w ma = ma + mw = mw ones = ones test_only = [] @@ -664,51 +675,59 @@ def test_Data_diff(self): self.assertTrue((a_diff == d_diff).all()) self.assertTrue((a_diff.mask == d_diff.mask).all()) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_compressed(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - a = np.ma.arange(12).reshape(3, 4) + d = cf.Data(a, "m", chunks=2) + self.assertIsNone(d.compressed(inplace=True)) + self.assertEqual(d.shape, (a.size,)) + self.assertEqual(d.Units, cf.Units("m")) + self.assertEqual(d.dtype, a.dtype) - d = cf.Data(a) - self.assertTrue((d.array == a).all()) - self.assertTrue((a.compressed() == d.compressed()).all()) - - e = d.copy() - x = e.compressed(inplace=True) - self.assertIsNone(x) - self.assertTrue(e.equals(d.compressed())) - - a[1, 1] = np.ma.masked - a[2, 3] = np.ma.masked + d = cf.Data(a, "m", chunks=2) + self.assertTrue((d.compressed().array == a.compressed()).all()) - d = cf.Data(a) - self.assertTrue((d.array == a).all()) - self.assertTrue((d.mask.array == a.mask).all()) - self.assertTrue((a.compressed() == d.compressed()).all()) + a[2] = np.ma.masked + d = cf.Data(a, "m", chunks=2) + self.assertTrue((d.compressed().array == a.compressed()).all()) - e = d.copy() - x = e.compressed(inplace=True) - self.assertIsNone(x) - self.assertTrue(e.equals(d.compressed())) + a[...] = np.ma.masked + d = cf.Data(a, "m", chunks=2) + e = d.compressed() + self.assertEqual(e.shape, (0,)) + self.assertTrue((e.array == a.compressed()).all()) - d = cf.Data(self.a, "km") - self.assertTrue((self.a.flatten() == d.compressed()).all()) + # Scalar arrays + a = np.ma.array(9) + d = cf.Data(a, "m") + e = d.compressed() + self.assertEqual(e.shape, (1,)) + self.assertTrue((e.array == a.compressed()).all()) - d = cf.Data(self.ma, "km") - self.assertTrue((self.ma.compressed() == d.compressed()).all()) + a = np.ma.array(9, mask=True) + d = cf.Data(a, "m") + e = d.compressed() + self.assertEqual(e.shape, (0,)) + self.assertTrue((e.array == a.compressed()).all()) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_shape'") + @unittest.skipIf(TEST_DASKIFIED_ONLY, "Needs __eq__") def test_Data_stats(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return + d = cf.Data([1, 1]) - d = cf.Data([[0, 1, 2], [3, -99, 5]], mask=[[0, 0, 0], [0, 1, 0]]) - - self.assertIsInstance(d.stats(), dict) - _ = d.stats(all=True) - _ = d.stats(mean_of_upper_decile=True, range=False) + self.assertEqual( + d.stats(sum=True, weights=1), + { + "minimum": 1, + "mean": 1.0, + "median": 1.0, + "maximum": 1, + "range": 0, + "mid_range": 1.0, + "standard_deviation": 0.0, + "root_mean_square": 1.0, + "sum": 2, + "sample_size": 2, + }, + ) @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_shape'") def test_Data__init__dtype_mask(self): @@ -1811,58 +1830,22 @@ def test_Data_flip(self): self.assertEqual(d[0].shape, (1, 4, 5)) self.assertEqual(d[-1].shape, (1, 4, 5)) - self.assertEqual(d[0].maximum(), 4 * 5) - self.assertEqual(d[-1].maximum(), 3 * 4 * 5) + self.assertEqual(d[0].max().array, 4 * 5) + self.assertEqual(d[-1].max().array, 3 * 4 * 5) for i in (2, 1): e = d.flip(i) self.assertEqual(e[0].shape, (1, 4, 5)) self.assertEqual(e[-1].shape, (1, 4, 5)) - self.assertEqual(e[0].maximum(), 4 * 5) - self.assertEqual(e[-1].maximum(), 3 * 4 * 5) + self.assertEqual(e[0].max().array, 4 * 5) + self.assertEqual(e[-1].max().array, 3 * 4 * 5) i = 0 e = d.flip(i) self.assertEqual(e[0].shape, (1, 4, 5)) self.assertEqual(e[-1].shape, (1, 4, 5)) - self.assertEqual(e[0].maximum(), 3 * 4 * 5) - self.assertEqual(e[-1].maximum(), 4 * 5) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute 'datum'") - def test_Data_max(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (False, True): - d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre") - self.assertEqual( - d.maximum(_preserve_partitions=pp), cf.Data(6, "metre") - ) - self.assertEqual(d.maximum(_preserve_partitions=pp).datum(), 6) - d[0, 2] = cf.masked - self.assertEqual(d.maximum(_preserve_partitions=pp), 5) - self.assertEqual(d.maximum(_preserve_partitions=pp).datum(), 5) - self.assertEqual( - d.maximum(_preserve_partitions=pp), cf.Data(0.005, "km") - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_min(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (False, True): - d = cf.Data([[4, 5, 6], [1, 2, 3]], "metre") - self.assertEqual( - d.minimum(_preserve_partitions=pp), cf.Data(1, "metre") - ) - self.assertEqual(d.minimum(_preserve_partitions=pp).datum(), 1) - d[1, 0] = cf.masked - self.assertEqual(d.minimum(_preserve_partitions=pp), 2) - self.assertEqual(d.minimum(_preserve_partitions=pp).datum(), 2) - self.assertEqual( - d.minimum(_preserve_partitions=pp), cf.Data(0.002, "km") - ) + self.assertEqual(e[0].max().array, 3 * 4 * 5) + self.assertEqual(e[-1].max().array, 4 * 5) def test_Data_ndindex(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: @@ -2334,32 +2317,6 @@ def test_Data__int__(self): with self.assertRaises(Exception): _ = int(cf.Data([1, 2])) - def test_Data__round__(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for ndigits in ([], [0], [1], [2], [3]): - for x in ( - -1.9123, - -1.5789, - -1.4123, - -1.789, - 0, - 1.123, - 1.0234, - 1.412, - 1.9345, - ): - self.assertEqual( - round(cf.Data(x), *ndigits), round(x, *ndigits) - ) - self.assertEqual( - round(cf.Data(x), *ndigits), round(x, *ndigits) - ) - - with self.assertRaises(Exception): - _ = round(cf.Data([1, 2])) - def test_Data_argmax(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -2387,154 +2344,6 @@ def test_Data_argmax(self): with self.assertRaises(Exception): d.argmax(axis=d.ndim) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits 'NoneType' is not iterable") - def test_Data__collapse_SHAPE(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - a = np.arange(-100, 200.0, dtype=float).reshape(3, 4, 5, 5) - - for h in ( - "sample_size", - "sum", - "min", - "max", - "mean", - "var", - "sd", - "mid_range", - "range", - "integral", - "maximum_absolute_value", - "minimum_absolute_value", - "sum_of_squares", - "root_mean_square", - "mean_absolute_value", - "median", - "mean_of_upper_decile", - "sum_of_weights", - "sum_of_weights2", - ): - - d = cf.Data(a[(slice(None, None, -1),) * a.ndim].copy()) - d.flip(inplace=True) - _ = cf.Data(self.w.copy()) - - shape = list(d.shape) - - for axes in self.axes_combinations: - e = getattr(d, h)( - axes=axes, squeeze=False, _preserve_partitions=False - ) - - shape = list(d.shape) - for i in axes: - shape[i] = 1 - - shape = tuple(shape) - self.assertEqual( - e.shape, - shape, - "{}, axes={}, not squeezed bad shape: {} != {}".format( - h, axes, e.shape, shape - ), - ) - - for axes in self.axes_combinations: - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=False - ) - shape = list(d.shape) - for i in sorted(axes, reverse=True): - shape.pop(i) - - shape = tuple(shape) - self.assertEqual( - e.shape, - shape, - "{}, axes={}, squeezed bad shape: {} != {}".format( - h, axes, e.shape, shape - ), - ) - - e = getattr(d, h)(squeeze=True, _preserve_partitions=False) - shape = () - self.assertEqual( - e.shape, - shape, - "{}, axes={}, squeezed bad shape: {} != {}".format( - h, None, e.shape, shape - ), - ) - - e = getattr(d, h)(squeeze=False, _preserve_partitions=False) - shape = (1,) * d.ndim - self.assertEqual( - e.shape, - shape, - "{}, axes={}, not squeezed bad shape: {} != {}".format( - h, None, e.shape, shape - ), - ) - # --- End: for - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_max_min_sum_sum_of_squares(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for _np, h in zip( - (np.sum, np.amin, np.amax, np.sum), - ("sum", "min", "max", "sum_of_squares"), - ): - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - if h == "sum_of_squares": - b = b ** 2 - - b = _np(b, axis=-1) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) - - # unweighted, masked - d = cf.Data(self.ma) - for _np, h in zip( - (np.ma.sum, np.ma.amin, np.ma.amax, np.ma.sum), - ("sum", "min", "max", "sum_of_squares"), - ): - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - if h == "sum_of_squares": - b = b ** 2 - - b = _np(b, axis=-1) - b = np.ma.asanyarray(b) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, \nb.mask={}".format( - h, axes, e.mask.array, b.mask - ), - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) - def test_Data_percentile_median(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -2658,632 +2467,6 @@ def test_Data_percentile_median(self): with self.assertRaises(ValueError): d.percentile(q).array - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") - def test_Data_mean_of_upper_decile(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - p = np.percentile(b, 90, axis=-1, keepdims=True) - b = np.ma.where(b < p, np.ma.masked, b) - b = np.average(b, axis=-1) - - e = d.mean_of_upper_decile( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "mean_of_upper_decile, axis={}, unweighted, " - "unmasked \ne={}, \nb={}".format(axes, e.array, b), - ) - - # unweighted, masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - b = np.ma.filled(b, np.nan) - with np.testing.suppress_warnings() as sup: - sup.filter( - RuntimeWarning, message=".*All-NaN slice encountered" - ) - p = np.nanpercentile(b, 90, axis=-1, keepdims=True) - - b = np.ma.masked_where(np.isnan(b), b, copy=False) - - p = np.where(np.isnan(p), b.max() + 1, p) - - with np.testing.suppress_warnings() as sup: - sup.filter( - RuntimeWarning, - message=".*invalid value encountered in less", - ) - b = np.ma.where(b < p, np.ma.masked, b) - - b = np.ma.average(b, axis=-1) - b = np.ma.asanyarray(b) - - e = d.mean_of_upper_decile( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "mean_of_upper_decile, axis={}, \ne.mask={}, " - "\nb.mask={}".format(axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "mean_of_upper_decile, axis={}, " - "unweighted, masked " - "\ne={}, \nb={}".format(axes, e.array, b), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_range_mid_range(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for h in ("range", "mid_range"): - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - mn = np.amin(b, axis=-1) - mx = np.amax(b, axis=-1) - if h == "range": - b = mx - mn - elif h == "mid_range": - b = (mx + mn) * 0.5 - - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) - - # unweighted, masked - d = cf.Data(self.ma) - for h in ("range", "mid_range"): - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - mn = np.amin(b, axis=-1) - mx = np.amax(b, axis=-1) - if h == "range": - b = mx - mn - elif h == "mid_range": - b = (mx + mn) * 0.5 - - b = np.ma.asanyarray(b) - - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, " - "\nb.mask={}".format(h, axes, e.mask.array, b.mask), - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked " - "\ne={}, \nb={}".format(h, axes, e.array, b), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute 'w' for DataTest") - def test_Data_integral(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (True, False): - # unmasked - d = cf.Data(self.a) - x = cf.Data(self.w) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - v = reshape_array(self.w, axes) - b = np.sum(b * v, axis=-1) - - e = d.integral( - axes=axes, squeeze=True, weights=x, _preserve_partitions=pp - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, unmasked \ne={}, \nb={}".format( - axes, e.array, b - ), - ) - - # masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - v = reshape_array(self.w, axes) - b = np.sum(b * v, axis=-1) - b = np.ma.asanyarray(b) - - e = d.integral( - axes=axes, squeeze=True, weights=x, _preserve_partitions=pp - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "axis={} masked, \ne.mask={}, " - "\nb.mask={}".format(axes, e.mask.array, b.mask), - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, masked \ne={}, \nb={}".format(axes, e.array, b), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_sum_of_weights_sum_of_weights2(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for pp in (True, False): - # unweighted, unmasked - d = cf.Data(self.a) - for h in ("sum_of_weights", "sum_of_weights2"): - for axes in self.axes_combinations: - b = reshape_array(self.ones, axes) - b = b.sum(axis=-1) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked, pp={}, " - "\ne={}, \nb={}".format(h, axes, pp, e.array, b), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma) - for a, h in zip( - (self.mones, self.mones), ("sum_of_weights", "sum_of_weights2") - ): - for axes in self.axes_combinations: - b = reshape_array(a, axes) - b = np.ma.asanyarray(b.sum(axis=-1)) - e = getattr(d, h)( - axes=axes, squeeze=True, _preserve_partitions=pp - ) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, unweighted, masked, pp={}, " - "\ne.mask={}, \nb.mask={}".format( - h, axes, pp, e.mask.array, b.mask - ), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked, pp={}, " - "\ne={}, \nb={}".format(h, axes, pp, e.array, b), - ) - # --- End: for - - # weighted, masked - d = cf.Data(self.ma) - x = cf.Data(self.w) - for a, h in zip( - (self.mw, self.mw * self.mw), - ("sum_of_weights", "sum_of_weights2"), - ): - for axes in self.axes_combinations: - a = a.copy() - a.mask = self.ma.mask - b = reshape_array(a, axes) - b = np.ma.asanyarray(b.sum(axis=-1)) - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - _preserve_partitions=pp, - ) - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, " - "\nb.mask={}".format(h, axes, e.mask.array, b.mask), - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, \ne={}, \nb={}".format( - h, axes, e.array, b - ), - ) - # --- End: for - - # weighted, unmasked - d = cf.Data(self.a) - for a, h in zip( - (self.w, self.w * self.w), - ("sum_of_weights", "sum_of_weights2"), - ): - for axes in self.axes_combinations: - b = reshape_array(a, axes) - b = b.sum(axis=-1) - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, \ne={}, \nb={}".format( - h, axes, e.array, b - ), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_mean_mean_absolute_value(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - for absolute in (False, True): - a = self.a - ma = self.ma - method = "mean" - if absolute: - a = np.absolute(a) - ma = np.absolute(ma) - method = "mean_absolute_value" - - # unweighted, unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(a, axes) - b = np.mean(b, axis=-1) - e = getattr(d, method)(axes=axes, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{} axis={}, unweighted, unmasked \ne={}, " - "\nb={}".format(method, axes, e.array, b), - ) - # --- End: for - - # weighted, unmasked - x = cf.Data(self.w) - for axes in self.axes_combinations: - b = reshape_array(a, axes) - v = reshape_array(self.w, axes) - b = np.average(b, axis=-1, weights=v) - - e = getattr(d, method)(axes=axes, weights=x, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{} weighted, unmasked axis={}, \ne={}, " - "\nb={}".format(method, axes, e.array, b), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(ma, axes) - b = np.ma.average(b, axis=-1) - b = np.ma.asanyarray(b) - - e = getattr(d, method)(axes=axes, squeeze=True) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{} unweighted, masked axis={}, \ne.mask={}, " - "\nb.mask={}".format(method, axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{} unweighted, masked axis={}, \ne={}, " - "\nb={}, ".format(method, axes, e.array, b), - ) - # --- End: for - - # weighted, masked - for axes in self.axes_combinations: - b = reshape_array(ma, axes) - v = reshape_array(self.mw, axes) - b = np.ma.average(b, axis=-1, weights=v) - b = np.ma.asanyarray(b) - - e = getattr(d, method)(axes=axes, weights=x, squeeze=True) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{} weighted, masked axis={}, \ne.mask={}, " - "\nb.mask={}".format(method, axes, e.mask.array, b.mask), - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{} weighted, masked axis={}, \ne={}, " - "\nb={}, ".format(method, axes, e.array, b), - ) - # --- End: for - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_root_mean_square(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - # unweighted, unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) ** 2 - b = np.mean(b, axis=-1) ** 0.5 - e = d.root_mean_square(axes=axes, squeeze=True) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, unweighted, unmasked \ne={}, " - "\nb={}".format(axes, e.array, b), - ) - # --- End: for - - # weighted, unmasked - x = cf.Data(self.w) - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) ** 2 - v = reshape_array(self.w, axes) - b = np.average(b, axis=-1, weights=v) ** 0.5 - - e = d.root_mean_square(axes=axes, weights=x, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, weighted, unmasked \ne={}, " - "\nb={}".format(axes, e.array, b), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) ** 2 - b = np.ma.average(b, axis=-1) - b = np.ma.asanyarray(b) ** 0.5 - - e = d.root_mean_square(axes=axes, squeeze=True) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "axis={}, unweighted, masked \ne.mask={}, " - "\nb.mask={}, ".format(axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, unweighted, masked \ne={}, " - "\nb={}, ".format(axes, e.array, b), - ) - # --- End: for - - # weighted, masked - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) ** 2 - v = reshape_array(self.mw, axes) - b = np.ma.average(b, axis=-1, weights=v) - b = np.ma.asanyarray(b) ** 0.5 - - e = d.root_mean_square(axes=axes, weights=x, squeeze=True) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "axis={}, weighted, masked \ne.mask={}, " - "\nb.mask={}, ".format(axes, e.mask.array, b.mask), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, weighted, masked \ne={}, \nb={}, ".format( - axes, e.array, b - ), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") - def test_Data_sample_size(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - # unmasked - d = cf.Data(self.a) - for axes in self.axes_combinations: - b = reshape_array(self.ones, axes) - b = b.sum(axis=-1) - e = d.sample_size(axes=axes, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, \ne={}, \nb={}".format(axes, e.array, b), - ) - # --- End: for - - # masked - d = cf.Data(self.ma) - for axes in self.axes_combinations: - b = reshape_array(self.mones, axes) - b = b.sum(axis=-1) - e = d.sample_size(axes=axes, squeeze=True) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "axis={}, \ne={}, \nb={}".format(axes, e.array, b), - ) - - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'axes_combinations'") - def test_Data_sd_var(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - ddofs = (0, 1) - - for pp in (False, True): - # unweighted, unmasked - d = cf.Data(self.a, units="K") - for _np, h in zip((np.var, np.std), ("var", "sd")): - for ddof in ddofs: - for axes in self.axes_combinations: - b = reshape_array(self.a, axes) - b = _np(b, axis=-1, ddof=ddof) - e = getattr(d, h)( - axes=axes, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, unmasked pp={}, " - "\ne={}, \nb={}".format(h, axes, pp, e.array, b), - ) - # --- End: for - - # unweighted, masked - d = cf.Data(self.ma, units="K") - for _np, h in zip((np.ma.var, np.ma.std), ("var", "sd")): - for ddof in ddofs: - for axes in self.axes_combinations: - b = reshape_array(self.ma, axes) - b = _np(b, axis=-1, ddof=ddof) - e = getattr(d, h)( - axes=axes, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, unweighted, masked, pp={}, " - "\ne={}, \nb={}".format(h, axes, pp, e.array, b), - ) - # --- End: for - - # weighted, unmasked - d = cf.Data(self.a, units="K") - x = cf.Data(self.w) - for h in ("var", "sd"): - for axes in self.axes_combinations: - for ddof in (0, 1): - b = reshape_array(self.a, axes) - v = reshape_array(self.w, axes) - - avg = np.average(b, axis=-1, weights=v) - if np.ndim(avg) < b.ndim: - avg = np.expand_dims(avg, -1) - - b, V1 = np.average( - (b - avg) ** 2, axis=-1, weights=v, returned=True - ) - - if ddof == 1: - # Calculate the weighted unbiased - # variance. The unbiased variance - # weighted with _reliability_ weights - # is [V1**2/(V1**2-V2)]*var. - V2 = np.asanyarray((v * v).sum(axis=-1)) - b *= V1 * V1 / (V1 * V1 - V2) - elif ddof == 0: - pass - - if h == "sd": - b **= 0.5 - - b = np.ma.asanyarray(b) - - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, weighted, unmasked, pp={}, " - "ddof={}, \ne={}, \nb={}".format( - h, axes, pp, ddof, e.array, b - ), - ) - # --- End: for - - # weighted, masked - d = cf.Data(self.ma, units="K") - x = cf.Data(self.w) - for h in ("var", "sd"): - for axes in self.axes_combinations: - for ddof in (0, 1): - b = reshape_array(self.ma, axes) - v = reshape_array(self.mw, axes) - - not_enough_data = np.ma.count(b, axis=-1) <= ddof - - avg = np.ma.average(b, axis=-1, weights=v) - if np.ndim(avg) < b.ndim: - avg = np.expand_dims(avg, -1) - - b, V1 = np.ma.average( - (b - avg) ** 2, axis=-1, weights=v, returned=True - ) - - b = np.ma.where(not_enough_data, np.ma.masked, b) - - if ddof == 1: - # Calculate the weighted unbiased - # variance. The unbiased variance - # weighted with _reliability_ weights - # is [V1**2/(V1**2-V2)]*var. - V2 = np.asanyarray((v * v).sum(axis=-1)) - b *= V1 * V1 / (V1 * V1 - V2) - elif ddof == 0: - pass - - if h == "sd": - b **= 0.5 - - e = getattr(d, h)( - axes=axes, - weights=x, - squeeze=True, - ddof=ddof, - _preserve_partitions=pp, - ) - - if h == "sd": - self.assertEqual(e.Units, d.Units) - else: - self.assertEqual(e.Units, d.Units ** 2) - - self.assertTrue( - (e.mask.array == b.mask).all(), - "{}, axis={}, \ne.mask={}, " - "\nb.mask={}, ".format( - h, axes, e.mask.array, b.mask - ), - ) - self.assertTrue( - e.allclose(b, rtol=1e-05, atol=1e-08), - "{}, axis={}, weighted, masked, pp={}, " - "ddof={}, \ne={}, \nb={}".format( - h, axes, pp, ddof, e.array, b - ), - ) - # --- End: for - @unittest.skipIf(TEST_DASKIFIED_ONLY, "hits unexpected kwarg 'select'") def test_Data_dumpd_loadd_dumps(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: @@ -3912,6 +3095,690 @@ def test_Data_rechunk(self): self.assertEqual(e.chunks, ((4,), (5,))) self.assertTrue(e.equals(d)) + def test_Data_reshape(self): + a = self.ma + d = cf.Data(a) + + self.assertIsNone(d.reshape(*d.shape, inplace=True)) + self.assertEqual(d.shape, a.shape) + + for original_shape, new_shape, chunks in ( + ((10,), (10,), (3, 3, 4)), + ((10,), (10, 1, 1), 5), + ((10,), (1, 10), 5), + ((24,), (2, 3, 4), 12), + ((1, 24), (2, 3, 4), 12), + ((2, 3, 4), (24,), (1, 3, 4)), + ((2, 3, 4), (24,), 4), + ((2, 3, 4), (24, 1), 4), + ((2, 3, 4), (1, 24), 4), + ((4, 4, 1), (4, 4), 2), + ((4, 4), (4, 4, 1), 2), + ((1, 4, 4), (4, 4), 2), + ((1, 4, 4), (4, 4, 1), 2), + ((1, 4, 4), (1, 1, 4, 4), 2), + ((4, 4), (1, 4, 4, 1), 2), + ((4, 4), (1, 4, 4), 2), + ((2, 3), (2, 3), (1, 2)), + ((2, 3), (3, 2), 3), + ((4, 2, 3), (4, 6), 4), + ((3, 4, 5, 6), (3, 4, 5, 6), (2, 3, 4, 5)), + ((), (1,), 1), + ((1,), (), 1), + ((24,), (3, 8), 24), + ((24,), (4, 6), 6), + ((24,), (4, 3, 2), 6), + ((24,), (4, 6, 1), 6), + ((24,), (4, 6), (6, 12, 6)), + ((64, 4), (8, 8, 4), (16, 2)), + ((4, 64), (4, 8, 4, 2), (2, 16)), + ((4, 8, 4, 2), (2, 1, 2, 32, 2), (2, 4, 2, 2)), + ((4, 1, 4), (4, 4), (2, 1, 2)), + ((0, 10), (0, 5, 2), (5, 5)), + ((5, 0, 2), (0, 10), (5, 2, 2)), + ((0,), (2, 0, 2), (4,)), + ((2, 0, 2), (0,), (4, 4, 4)), + ((2, 3, 4), -1, -1), + ): + a = np.random.randint(10, size=original_shape) + d = cf.Data(a, chunks=chunks) + + a = a.reshape(new_shape) + d = d.reshape(new_shape) + + self.assertEqual(d.shape, a.shape) + self.assertTrue((d.array == a).all()) + + def test_Data_square(self): + a = self.ma.astype(float) + asquare = np.square(a) + + d = cf.Data(a) + self.assertIsNone(d.square(inplace=True)) + self.assertTrue((d.array == asquare).all()) + self.assertEqual(d.Units, cf.Units()) + + d = cf.Data(a, "m") + e = d.square() + self.assertEqual(e.dtype, asquare.dtype) + self.assertTrue((e.array == asquare).all()) + self.assertEqual(e.Units, cf.Units("m2")) + + asquare = np.square(a, dtype="float32") + e = d.square(dtype="float32") + self.assertEqual(e.dtype, asquare.dtype) + self.assertTrue((e.array == asquare).all()) + + def test_Data_sqrt(self): + a = self.ma.astype(float) + asqrt = np.sqrt(a) + + d = cf.Data(a) + self.assertIsNone(d.sqrt(inplace=True)) + self.assertTrue((d.array == asqrt).all()) + self.assertEqual(d.Units, cf.Units()) + + d = cf.Data(a, "m2") + e = d.sqrt() + self.assertEqual(e.dtype, asqrt.dtype) + self.assertTrue((e.array == asqrt).all()) + self.assertEqual(e.Units, cf.Units("m")) + + asqrt = np.sqrt(a, dtype="float32") + e = d.sqrt(dtype="float32") + self.assertEqual(e.dtype, asqrt.dtype) + self.assertTrue((e.array == asqrt).all()) + + # Incompatible units + d = cf.Data(a, "m") + with self.assertRaises(ValueError): + d.sqrt() + + def test_Data_integral(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.sum(b * w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.integral(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_max(self): + # Masked array + a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.max(b, axis=-1) + b = np.ma.asanyarray(b) + + e = d.max(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_maximum_absolute_value(self): + # Masked array + a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.max(abs(b), axis=-1) + b = np.ma.asanyarray(b) + + e = d.maximum_absolute_value(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_mean(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.ma.average(b, axis=-1, weights=w) + b = np.ma.asanyarray(b) + + e = d.mean(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_mean_absolute_value(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.ma.average(abs(b), axis=-1, weights=w) + b = np.ma.asanyarray(b) + + e = d.mean_absolute_value(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_mid_range(self): + # Masked array, non-masked weights + a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = (np.max(b, axis=-1) + np.min(b, axis=-1)) / 2.0 + b = np.ma.asanyarray(b) + + e = d.mid_range(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + with self.assertRaises(TypeError): + cf.Data([0, 1], dtype=bool).mid_range() + + def test_Data_min(self): + # Masked array + a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.min(b, axis=-1) + b = np.ma.asanyarray(b) + + e = d.min(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_minimum_absolute_value(self): + # Masked array + a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.min(abs(b), axis=-1) + b = np.ma.asanyarray(b) + + e = d.minimum_absolute_value(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_range(self): + # Masked array + a = self.ma + + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.max(b, axis=-1) - np.min(b, axis=-1) + b = np.ma.asanyarray(b) + + e = d.range(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + with self.assertRaises(TypeError): + cf.Data([0, 1], dtype=bool).range() + + def test_Data_root_mean_square(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.ma.average(b * b, axis=-1, weights=w) ** 0.5 + b = np.ma.asanyarray(b) + + e = d.root_mean_square(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_sample_size(self): + # Masked array + a = self.ma + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.sum(np.ones_like(b), axis=-1) + b = np.ma.asanyarray(b) + + e = d.sample_size(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + # Non-masked array + a = self.a + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.sum(np.ones_like(b), axis=-1) + b = np.asanyarray(b) + + e = d.sample_size(axes=axis, squeeze=True) + e = np.array(e.array) + + self.assertTrue(np.allclose(e, b)) + + def test_Data_std(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + std = d.std(weights=weights, ddof=1) + var = d.var(weights=weights, ddof=1) + + self.assertTrue(std.equals(var.sqrt())) + + def test_Data_sum(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.sum(b * w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_sum_of_squares(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.sum(b * b * w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum_of_squares(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_sum_of_weights(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + # Weights=None + for axis in axis_combinations(a): + b = reshape_array(a, axis) + b = np.sum(np.ones_like(b), axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum_of_weights(axes=axis, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + w = np.ma.masked_where(b.mask, w) + b = np.sum(w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum_of_weights(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_sum_of_weights2(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + # Weights=None + for axis in axis_combinations(a): + e = d.sum_of_weights2(axes=axis) + f = d.sum_of_weights(axes=axis) + self.assertTrue(e.equals(f)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + w = np.ma.masked_where(b.mask, w) + b = np.sum(w * w, axis=-1) + b = np.ma.asanyarray(b) + + e = d.sum_of_weights2(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_var(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + # Weighted ddof = 0 + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + mu, V1 = np.ma.average(b, axis=-1, weights=w, returned=True) + mu = mu.reshape(mu.shape + (1,)) + w = np.ma.masked_where(b.mask, w) + + b = np.sum(w * (b - mu) ** 2, axis=-1) + b = b / V1 + b = np.ma.asanyarray(b) + + e = d.var(axes=axis, weights=weights, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b), f"e={e}\nb={b}\ne-b={e-b}") + + # Weighted ddof = 1 + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + mu, V1 = np.ma.average(b, axis=-1, weights=w, returned=True) + mu = mu.reshape(mu.shape + (1,)) + w = np.ma.masked_where(b.mask, w) + V2 = np.sum(w * w, axis=-1) + + b = np.sum(w * (b - mu) ** 2, axis=-1) + b = b / (V1 - (V2 / V1)) + b = np.ma.asanyarray(b) + + e = d.var(axes=axis, weights=weights, ddof=1, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + # Unweighted ddof = 1 + for axis in axis_combinations(a): + b = reshape_array(a, axis) + mu, V1 = np.ma.average(b, axis=-1, returned=True) + mu = mu.reshape(mu.shape + (1,)) + + b = np.sum((b - mu) ** 2, axis=-1) + b = b / (V1 - 1) + b = np.ma.asanyarray(b) + + e = d.var(axes=axis, ddof=1, squeeze=True) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + @unittest.skipIf(TEST_DASKIFIED_ONLY, "Needs __lt__ and __le__") + def test_Data_mean_of_upper_decile(self): + # Masked array, non-masked weights + a = self.ma + weights = self.w + d = cf.Data(a, "K", chunks=(2, 3, 2, 5)) + + for axis in axis_combinations(a): + b = reshape_array(a, axis) + w = reshape_array(weights, axis) + b = np.ma.filled(b, np.nan) + with np.testing.suppress_warnings() as sup: + sup.filter( + RuntimeWarning, message=".*All-NaN slice encountered" + ) + p = np.nanpercentile(b, 90, axis=-1, keepdims=True) + + b = np.ma.masked_where(np.isnan(b), b, copy=False) + p = np.where(np.isnan(p), b.max() + 1, p) + + with np.testing.suppress_warnings() as sup: + sup.filter( + RuntimeWarning, + message=".*invalid value encountered in less", + ) + b = np.ma.where(b < p, np.ma.masked, b) + + b = np.ma.average(b, axis=-1, weights=w) + b = np.ma.asanyarray(b) + + e = d.mean_of_upper_decile( + axes=axis, weights=weights, squeeze=True + ) + e = np.ma.array(e.array) + + self.assertTrue((e.mask == b.mask).all()) + self.assertTrue(np.allclose(e, b)) + + def test_Data_collapse_mtol(self): + # Data with exactly half of its elements masked + d = cf.Data(np.arange(6), "K", mask=[0, 1, 0, 1, 0, 1], chunks=2) + + for func in ( + d.integral, + d.mean, + d.mean_absolute_value, + d.median, + d.min, + d.mid_range, + d.minimum_absolute_value, + d.max, + d.maximum_absolute_value, + d.range, + d.root_mean_square, + d.sample_size, + d.std, + d.sum, + d.sum_of_squares, + d.sum_of_weights, + d.sum_of_weights2, + d.var, + ): + self.assertTrue(func(mtol=0.4).array.mask) + self.assertFalse(func(mtol=0.5).array.mask) + + # TODODASK - add in mean_of_upper_decile when it's daskified + + def test_Data_collapse_units(self): + d = cf.Data([1, 2], "K") + + self.assertEqual(d.sample_size().Units, cf.Units()) + + for func in ( + d.integral, + d.mean, + d.mean_absolute_value, + d.median, + d.min, + d.mid_range, + d.minimum_absolute_value, + d.max, + d.maximum_absolute_value, + d.range, + d.root_mean_square, + d.std, + d.sum, + ): + self.assertEqual(func().Units, d.Units) + + for func in ( + d.sum_of_squares, + d.var, + ): + self.assertEqual(func().Units, d.Units ** 2) + + for func in ( + d.sum_of_weights, + d.sum_of_weights2, + ): + self.assertEqual(func().Units, cf.Units()) + + # Weighted + w = cf.Data(1, "m") + self.assertEqual(d.integral(weights=w).Units, d.Units * w.Units) + self.assertEqual(d.sum_of_weights(weights=w).Units, w.Units) + self.assertEqual(d.sum_of_weights2(weights=w).Units, w.Units ** 2) + + # Dimensionless data + d = cf.Data([1, 2]) + self.assertEqual(d.integral(weights=w).Units, w.Units) + + for func in ( + d.sum_of_squares, + d.var, + ): + self.assertEqual(func().Units, cf.Units()) + + # TODODASK - add in mean_of_upper_decile when it's daskified + + def test_Data_collapse_keepdims(self): + d = cf.Data(np.arange(6).reshape(2, 3)) + + for func in ( + d.integral, + d.mean, + d.mean_absolute_value, + d.median, + d.min, + d.mid_range, + d.minimum_absolute_value, + d.max, + d.maximum_absolute_value, + d.range, + d.root_mean_square, + d.sample_size, + d.std, + d.sum, + d.sum_of_squares, + d.sum_of_weights, + d.sum_of_weights2, + d.var, + ): + for axis in axis_combinations(d): + e = func(axes=axis, squeeze=False) + s = [1 if i in axis else n for i, n in enumerate(d.shape)] + self.assertEqual(e.shape, tuple(s)) + + for axis in axis_combinations(d): + e = func(axes=axis, squeeze=True) + s = [n for i, n in enumerate(d.shape) if i not in axis] + self.assertEqual(e.shape, tuple(s)) + + # TODODASK - add in mean_of_upper_decile + + def test_Data_collapse_dtype(self): + d = cf.Data([1, 2, 3, 4], dtype="i4", chunks=2) + e = cf.Data([1.0, 2, 3, 4], dtype="f4", chunks=2) + self.assertTrue(d.dtype, "i4") + self.assertTrue(e.dtype, "f4") + + # Cases for which both d and e collapse to a result of the + # same data type + for x, r in zip((d, e), ("i4", "f4")): + for func in ( + x.min, + x.minimum_absolute_value, + x.max, + x.maximum_absolute_value, + x.range, + ): + self.assertEqual(func().dtype, r) + + # Cases for which both d and e collapse to a result of the + # double of same data type + for x, r in zip((d, e), ("i8", "f8")): + for func in ( + x.integral, + x.sum, + x.sum_of_squares, + ): + self.assertEqual(func().dtype, r) + + # Cases for which both d and e collapse to a result of double + # float data type + for x, r in zip((d, e), ("f8", "f8")): + for func in ( + x.mean, + x.mean_absolute_value, + x.median, + x.mid_range, + x.root_mean_square, + x.std, + x.var, + ): + self.assertEqual(func().dtype, r) + + x = d + for func in ( + x.sum_of_weights, + x.sum_of_weights2, + ): + self.assertEqual(func().dtype, "i8") + + # Weights + w_int = cf.Data(1, dtype="i4") + w_float = cf.Data(1.0, dtype="f4") + for w, r in zip((w_int, w_float), ("i8", "f8")): + for func in ( + d.integral, + d.sum, + d.sum_of_squares, + d.sum_of_weights, + d.sum_of_weights2, + ): + self.assertTrue(func(weights=w).dtype, r) + + # TODODASK - add in mean_of_upper_decile + def test_Data_get_units(self): for units in ("", "m", "days since 2000-01-01"): d = cf.Data(1, units) @@ -3967,6 +3834,20 @@ def test_Data_tolist(self): self.assertEqual(e, np.array(x).tolist()) self.assertTrue(d.equals(cf.Data(e))) + def test_Data_uncompress(self): + import cfdm + + f = cfdm.read("DSG_timeSeries_contiguous.nc")[0] + a = f.data.array + d = cf.Data(cf.RaggedContiguousArray(source=f.data.source())) + + self.assertTrue(d.get_compression_type()) + self.assertTrue((d.array == a).all()) + + self.assertIsNone(d.uncompress(inplace=True)) + self.assertFalse(d.get_compression_type()) + self.assertTrue((d.array == a).all()) + def test_Data_data(self): for d in [ cf.Data(1), @@ -3985,6 +3866,14 @@ def test_Data_dump(self): ) self.assertEqual(d.dump(display=False), x) + def test_Data_fill_value(self): + d = cf.Data([1, 2], "m") + self.assertIsNone(d.fill_value) + d.fill_value = 999 + self.assertEqual(d.fill_value, 999) + del d.fill_value + self.assertIsNone(d.fill_value) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/docs/Makefile b/docs/Makefile index e101c0f118..6850180e1d 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -2,7 +2,7 @@ # # You can set these variables from the command line. -SPHINXOPTS = -j 2 +SPHINXOPTS = -j 1 SPHINXBUILD = sphinx-build PAPER = #BUILDDIR = build diff --git a/docs/source/field_analysis.rst b/docs/source/field_analysis.rst index 3ea43360f5..fe2b323451 100644 --- a/docs/source/field_analysis.rst +++ b/docs/source/field_analysis.rst @@ -125,90 +125,123 @@ Collapse methods ^^^^^^^^^^^^^^^^ The following collapse methods are available, over any subset of the -domain axes. The "Cell method" column in the table gives the method of -the new cell method construct (if one is created). +domain axes. The "Cell method" column in the table gives the method +name, defined by the CF conventions, of the new cell method construct +(if one is created). ============================ ======================================== ========================== Method Description Cell method ============================ ======================================== ========================== -``'maximum'`` The maximum of the values. ``maximum`` +``'sample_size'`` The sample size, :math:`N`, equal to the ``point`` + number of non-missing values. + +``'maximum'`` The maximum of the non-missing values. ``maximum`` + + .. math:: m_x=\max\{x_0, \ldots, x_N\} -``'minimum'`` The minimum of the values. ``minimum`` +``'minimum'`` The minimum of the non-missing values. ``minimum`` + + .. math:: m_n=\min\{x_0, \ldots, x_N\} -``'maximum_absolute_value'`` The maximum of the absolute values. ``maximum_absolute_value`` +``'maximum_absolute_value'`` The maximum of the non-missing absolute ``maximum_absolute_value`` + values. + + .. math:: \max\{|x_0|, \ldots, |x_N|\} -``'minimum_absolute_value'`` The minimum of the absolute values. ``minimum_absolute_value`` +``'minimum_absolute_value'`` The minimum of the non-missing absolute ``minimum_absolute_value`` + values. + + .. math:: \min\{|x_0|, \ldots, |x_N|\} ``'mid_range'`` The average of the maximum and the ``mid_range`` - minimum of the values. + minimum of the non-missing values. + + .. math:: \frac{m_x + m_n}{2} ``'range'`` The absolute difference between the ``range`` - maximum and the minimum of the values. + maximum and the minimum of the + non-missing values. + + .. math:: m_x - m_n -``'median'`` The median of the values. ``median`` +``'median'`` The median of the :math:`N` non-missing ``median`` + values. -``'sample_size'`` The sample size, :math:`N`, as would be ``point`` - used for other calculations, i.e. the - number of non-missing values. - -``'sum_of_weights'`` The sum of :math:`N` weights ``sum`` - :math:`w_i`, as would be used for other - calculations, is +``'sum_of_weights'`` The sum of the :math:`N` weights ``sum`` + :math:`w_i` that correspond to + non-missing values. .. math:: V_{1}=\sum_{i=1}^{N} w_i -``'sum_of_weights2'`` The sum of the squares of :math:`N` ``sum`` - weights :math:`w_i`, as would be used - for other calculations, is +``'sum_of_weights2'`` The sum of the squares of the :math:`N` ``sum`` + weights :math:`w_i` that correspond to + non-missing values. - .. math:: V_{2}=\sum_{i=1}^{N} w_i^{2} + .. math:: V_{2}=\sum_{i=1}^{N} w_i^{2} -``'sum'`` The unweighted sum of :math:`N` values ``sum`` - :math:`x_i` is +``'sum'`` The unweighted sum of the :math:`N` ``sum`` + non-missing values :math:`x_i` is .. math:: t=\sum_{i=1}^{N} x_i -``'sum_of_squares'`` The unweighted sum of the squares of ``sum_of_squares`` - :math:`N` values :math:`x_i` is + The weighted sum of the :math:`N` + non-missing values :math:`x_i` with + corresponding weights :math:`w_i` is + + .. math:: \hat{t}=\sum_{i=1}^{N} w_i x_i + +``'sum_of_squares'`` The unweighted sum of the squares of the ``sum_of_squares`` + :math:`N` non-missing values :math:`x_i` + is .. math:: t_2=\sum_{i=1}^{N} x_{i}^{2} -``'integral'`` The integral of :math:`N` values ``sum`` - :math:`x_i` with corresponding cell - measures :math:`m_i` is + The weighted sum of the squares of the + :math:`N` non-missing values :math:`x_i` + with corresponding weights :math:`w_i` + is + + .. math:: \hat{t}_2=\sum_{i=1}^{N} + w_i x_{i}^{2} + +``'integral'`` The :ref:`weighted ` ``sum`` + sum of the :math:`N` non-missing values + :math:`x_i` with corresponding cell + measures :math:`m_i`. .. math:: i=\sum_{i=1}^{N} m_i x_i - Note that the integral differs from a - weighted sum in that the units of the - cell measures are incorporated into the - result. + .. note:: The integral differs from a + weighted sum in that the units + of the measures are + incorporated into the result. -``'mean'`` The unweighted mean of :math:`N` values ``mean`` - :math:`x_i` is +``'mean'`` The unweighted mean of the :math:`N` ``mean`` + non-missing values :math:`x_i` is .. math:: \mu=\frac{1}{N}\sum_{i=1}^{N} x_i The :ref:`weighted ` - mean of :math:`N` values :math:`x_i` - with corresponding weights :math:`w_i` - is + mean of the :math:`N` non-missing values + :math:`x_i` with corresponding weights + :math:`w_i`. is .. math:: \hat{\mu}=\frac{1}{V_{1}} \sum_{i=1}^{N} - w_i x_i + w_i x_i -``'mean_absolute_value'`` The unweighted mean of :math:`N` ``mean_absolute_value`` - values :math:`x_i` absoluted is +``'mean_absolute_value'`` The unweighted mean of the :math:`N` ``mean_absolute_value`` + non-missing absolute values :math:`x_i` + is .. math:: \mu_{abs}=\frac{1}{N} \sum_{i=1}^{N}|x_i| The :ref:`weighted ` - mean of :math:`N` values :math:`x_i` - absoluted with corresponding weights - :math:`w_i` is + mean of the :math:`N` non-missing + absolute values :math:`x_i` with + corresponding weights :math:`w_i` is .. math:: \hat{\mu}_{abs}= \frac{1}{V_{1}} @@ -218,8 +251,8 @@ Method Description Cell met upper group of data values defined by the upper tenth of their distribution -``'variance'`` The unweighted variance of :math:`N` ``variance`` - values :math:`x_i` and with +``'variance'`` The unweighted variance of the :math:`N` ``variance`` + non-missing values :math:`x_i` and with :math:`N-ddof` degrees of freedom (:math:`ddof\ge0`) is @@ -236,9 +269,10 @@ Method Description Cell met :math:`ddof=1`. The :ref:`weighted ` - biased estimate of the variance of - :math:`N` values :math:`x_i` with - corresponding weights :math:`w_i` is + biased estimate of the variance of the + :math:`N` non-missing values :math:`x_i` + with corresponding weights :math:`w_i` + is .. math:: \hat{s}_{N}^{2}= \frac{1}{V_{1}} @@ -246,9 +280,11 @@ Method Description Cell met w_i(x_i - \hat{\mu})^{2} - The corresponding :ref:`weighted - ` unbiased estimate of - the variance is + The :ref:`weighted ` + unbiased estimate of the variance of the + :math:`N` non-missing values :math:`x_i` + with corresponding weights :math:`w_i` + is .. math:: \hat{s}^{2}=\frac{1}{V_{1} - (V_{1}/V_{2})} @@ -256,28 +292,31 @@ Method Description Cell met w_i(x_i - \hat{\mu})^{2} - In both cases, the weights are assumed - to be non-random reliability weights, as - opposed to frequency weights. + .. note:: The weights used in the + variance calculations are + assumed to be non-random + reliability weights, as + opposed to frequency weights. ``'standard_deviation'`` The standard deviation is the square ``standard_deviation`` root of the unweighted or :ref:`weighted ` - variance, as defined in this table. + variance. -``'root_mean_square'`` The unweighted root mean square of ``root_mean_square`` - :math:`N` values :math:`x_i` is +``'root_mean_square'`` The unweighted root mean square of the ``root_mean_square`` + :math:`N` non-missing values + :math:`x_i` is - .. math:: RMS=\sqrt{\frac{1}{N} + .. math:: r=\sqrt{\frac{1}{N} \sum_{i=1}^{N} x_{i}^2} The :ref:`weighted ` - root mean square of :math:`N` values - :math:`x_i` with corresponding weights - :math:`w_i` is + root mean square of the :math:`N` + non-missing values :math:`x_i` with + corresponding weights :math:`w_i` is - .. math:: \hat{RMS}=\sqrt{ + .. math:: \hat{r}=\sqrt{ \frac{1}{V_{1}} \sum_{i=1}^{N} w_i x_{i}^2} diff --git a/requirements.txt b/requirements.txt index 4b7bb69d27..49aa387a59 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ numpy>=1.22 cfdm>=1.9.1.0, <1.9.2.0 psutil>=0.6.0 cfunits>=3.3.4 +dask>=2022.03.0 From 6a93fe867722f4b9caece8fe1ba3aa06c82e7ca4 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 22 Apr 2022 08:34:49 +0100 Subject: [PATCH 3/9] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index 13c9847f37..cc14d294a6 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -386,7 +386,7 @@ def __init__( independent reads for every dask chunk, each time the data are computed. - In geneal, setting *to_memory* to True is not the same + In general, setting *to_memory* to True is not the same as calling the `persist` of the newly created `Data` object, which also decompresses data compressed by convention and computes any data type, mask and From 33696fab9cc5774e9bac114f9cba5a5403b588e3 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 22 Apr 2022 08:35:33 +0100 Subject: [PATCH 4/9] Remove superflous text Co-authored-by: Sadie L. Bartholomew --- cf/data/mixin/deprecations.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index fd2661ae4a..4bcaa6f486 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -112,8 +112,6 @@ def in_memory(self): Deprecated at version TODODASK. - :Returns: - **Examples** >>> d.in_memory From 59328f78c5fb466ccaab779b45d163e4f248bf6c Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 22 Apr 2022 08:35:49 +0100 Subject: [PATCH 5/9] Remove superfluous text Co-authored-by: Sadie L. Bartholomew --- cf/data/mixin/deprecations.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index 4bcaa6f486..e07027099a 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -111,7 +111,6 @@ def in_memory(self): """True if the array is retained in memory. Deprecated at version TODODASK. - **Examples** >>> d.in_memory From 9274f58e57753846148e8766239147678fb201fc Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 22 Apr 2022 08:36:02 +0100 Subject: [PATCH 6/9] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/mixin/deprecations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index e07027099a..c5999cc1fa 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -418,7 +418,7 @@ def to_disk(self): Deprecated at version TODODASK. - There is no change to partition's whose sub-arrays are already + There is no change to partitions whose sub-arrays are already on disk. :Returns: From 99cffd46d369a8df0cbcdd155650541b5eb1e1d4 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 22 Apr 2022 08:45:58 +0100 Subject: [PATCH 7/9] fix deprecation error attribute --- cf/data/mixin/deprecations.py | 3 --- cf/functions.py | 5 ++--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index c5999cc1fa..e8128fb9cd 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -111,9 +111,6 @@ def in_memory(self): """True if the array is retained in memory. Deprecated at version TODODASK. - **Examples** - - >>> d.in_memory """ _DEPRECATION_ERROR_ATTRIBUTE( diff --git a/cf/functions.py b/cf/functions.py index 4c8355a38f..3a201ec53c 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -3204,11 +3204,10 @@ def _DEPRECATION_ERROR_METHOD( def _DEPRECATION_ERROR_ATTRIBUTE( instance, attribute, message="", version="3.0.0", removed_at="4.0.0" ): - warnings.warn( + raise DeprecationError( f"{instance.__class__.__name__} attribute {attribute!r} has been " f"deprecated at version {version} and will be removed at version " - f"{removed_at}. {message}", - DeprecationWarning, + f"{removed_at}. {message}" ) From 48e20fa12a2de01c9002b0d12d74e5ec893ced15 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 22 Apr 2022 15:50:29 +0100 Subject: [PATCH 8/9] Data.to_memory --- cf/data/data.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cf/data/data.py b/cf/data/data.py index cc14d294a6..dce8023ffa 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -11573,6 +11573,18 @@ def tolist(self): """ return self.array.tolist() + @daskified(_DASKIFIED_VERBOSE) + def to_memory(self): + """Bring data on disk into memory. + + Not implemented. Consider using `persist` instead. + + """ + raise NotImplementedError( + "'Data.to_memory' is not available. " + "Consider using 'Data.persist' instead." + ) + @daskified(_DASKIFIED_VERBOSE) @_deprecated_kwarg_check("i") @_inplace_enabled(default=False) From 7aecaca9ccd3c1a36814c2d0169b46069cd689cf Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 22 Apr 2022 16:13:09 +0100 Subject: [PATCH 9/9] Data.to_memory --- cf/data/mixin/deprecations.py | 40 ----------------------------------- 1 file changed, 40 deletions(-) diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index e8128fb9cd..9d5a23b69f 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -433,43 +433,3 @@ def to_disk(self): version="TODODASK", removed_at="5.0.0", ) # pragma: no cover - - def to_memory(self, regardless=False, parallelise=False): - """Store each partition's data in memory in place if the master - array is smaller than the chunk size. - - Deprecated at version TODODASK. Consider using `persist` - instead. - - There is no change to partitions with data that are already in - memory. - - :Parameters: - - regardless: `bool`, optional - If True then store all partitions' data in memory - regardless of the size of the master array. By default - only store all partitions' data in memory if the - master array is smaller than the chunk size. - - parallelise: `bool`, optional - If True than only move those partitions to memory that - are flagged for processing on this rank. - - :Returns: - - `None` - - **Examples** - - >>> d.to_memory() - >>> d.to_memory(regardless=True) - - """ - _DEPRECATION_ERROR_METHOD( - self, - "to_memory", - message="Consider using 'persist' instead.", - version="TODODASK", - removed_at="5.0.0", - ) # pragma: no cover