diff --git a/cf/__init__.py b/cf/__init__.py index 4d421c3ab3..fda9409f88 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -150,7 +150,7 @@ ) # Check the version of cftime -_minimum_vn = "1.5.0" +_minimum_vn = "1.6.0" if LooseVersion(cftime.__version__) < LooseVersion(_minimum_vn): raise RuntimeError( f"Bad cftime version: cf requires cftime>={_minimum_vn}. " diff --git a/cf/cfdatetime.py b/cf/cfdatetime.py index a22de7a406..27102383e0 100644 --- a/cf/cfdatetime.py +++ b/cf/cfdatetime.py @@ -6,7 +6,7 @@ from .functions import _DEPRECATION_ERROR_CLASS -_default_calendar = "gregorian" +default_calendar = "gregorian" # -------------------------------------------------------------------- # Mapping of CF calendars to cftime date-time objects @@ -21,6 +21,21 @@ ("julian",): cftime.DatetimeJulian, } +canonical_calendar = { + None: "standard", + "gregorian": "standard", + "standard": "standard", + "proleptic_gregorian": "proleptic_gregorian", + "julian": "julian", + "noleap": "noleap", + "365_day": "noleap", + "all_366_day": "all_leap", + "all_leap": "all_leap", + "": "", + "none": "", +} + + _calendar_map = {None: "gregorian"} @@ -138,7 +153,7 @@ def dt( (year, month, day, hour, minute, second) = arg.timetuple()[:6] microsecond = arg.microsecond if calendar == "": - calendar = _default_calendar + calendar = default_calendar else: year = arg @@ -426,6 +441,9 @@ def rt2dt(array, units_in, units_out=None, dummy1=None): array, units, calendar, only_use_cftime_datetimes=True ) + if not isinstance(array, np.ndarray): + array = np.array(array, dtype=object) + return array @@ -462,15 +480,17 @@ def dt2rt(array, units_in, units_out, dummy1=None): An array of numbers with the same shape as *array*. 
""" - ndim = np.ndim(array) + isscalar = np.ndim(array) - # array = units_out._utime.date2num(array) array = cftime.date2num( array, units=units_out.units, calendar=units_out._utime.calendar ) - if not ndim: - array = np.asanyarray(array) + if isscalar: + if array is np.ma.masked: + array = np.ma.masked_all(()) + else: + array = np.asanyarray(array) return array diff --git a/cf/data/creation.py b/cf/data/creation.py index 9adfbe3846..9ba7ad4b40 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -70,19 +70,13 @@ def convert_to_builtin_type(x): raise TypeError(f"{type(x)!r} object is not JSON serializable: {x!r}") -def to_dask(array, chunks, dask_from_array_options): +def to_dask(array, chunks, **from_array_options): """TODODASK. .. versionadded:: 4.0.0 """ - if "chunks" in dask_from_array_options: - raise TypeError( - "Can't define chunks in the 'dask_from_array_options' " - "dictionary. Use the 'chunks' parameter instead" - ) - - kwargs = dask_from_array_options.copy() + kwargs = from_array_options kwargs.setdefault("asarray", getattr(array, "dask_asarray", None)) kwargs.setdefault("lock", getattr(array, "dask_lock", False)) diff --git a/cf/data/data.py b/cf/data/data.py index 9aa329a65e..68523323d5 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -129,14 +129,6 @@ scalar_masked_array, ) -# from .chunk_utils import ( # is_small,; is_very_small, -# harden_mask_chunk, -# soften_mask_chunk, -# ) - -# from dask.array import Array - - _DASKIFIED_VERBOSE = None # see below for valid levels, adapt as useful @@ -315,7 +307,7 @@ def __init__( copy=True, dtype=None, mask=None, - dask_from_array_options={}, + init_options=None, _use_array=True, ): """**Initialization** @@ -433,9 +425,34 @@ def __init__( {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK + + init_options: `dict`, optional + Provide optional keyword arguments to methods and + functions called during the initialisation process. 
A + dictionary key identifies a method or function. The + corresponding value is another dictionary whose + key/value pairs are the keyword parameter names and + values to be applied. - chunk: deprecated at version 4.0.0 + Supported keys are: + + * ``'from_array'``: Provide keyword arguments to + the `dask.array.from_array` function. This is used + when initialising data that is not already a dask + array and is not compressed by convention. + + * ``'first_non_missing_value'``: Provide keyword + arguments to the + `cf.data.utils.first_non_missing_value` + function. This is used when the input array contains + date-time strings or objects, and may affect + performance. + + *Parameter example:* + ``{'from_array': {'inline_array': True}}`` + + chunk: deprecated at version TODODASK Use the *chunks* parameter instead. **Examples:** @@ -452,6 +469,9 @@ def __init__( if source is None and isinstance(array, self.__class__): source = array + if init_options is None: + init_options = {} + if source is not None: if loadd is not None: raise ValueError( @@ -560,10 +580,10 @@ def __init__( "Consider rechunking after initialisation." ) - if dask_from_array_options: + if init_options.get("from_array"): raise ValueError( - "Can't define 'dask.array.from_array' parameters for " - "compressed input arrays" + "Can't define 'from_array' initialisation options " + "for compressed input arrays" ) # Save the input compressed array, as this will contain @@ -574,7 +594,15 @@ def __init__( elif not is_dask_collection(array): # Turn the data into a dask array - array = to_dask(array, chunks, dask_from_array_options) + kwargs = init_options.get("from_array", {}) + if "chunks" in kwargs: + raise TypeError( + "Can't define 'chunks' in the 'from_array' " + "initialisation options. " + "Use the 'chunks' parameter instead." 
+ ) + + array = to_dask(array, chunks, **kwargs) elif chunks != _DEFAULT_CHUNKS: # The data is already a dask array @@ -585,15 +613,20 @@ def __init__( ) # Find out if we have an array of date-time objects + if units.isreftime: + dt = True + first_value = None if not dt and array.dtype.kind == "O": - first_value = first_non_missing_value(array) + kwargs = init_options.get("first_non_missing_value", {}) + first_value = first_non_missing_value(array, **kwargs) + if first_value is not None: dt = hasattr(first_value, "timetuple") # Convert string or object date-times to floating point - # reference times, if appropriate. - if array.dtype.kind in "USO" and (dt or units.isreftime): + # reference times + if dt and array.dtype.kind in "USO": array, units = convert_to_reftime(array, units, first_value) # Reset the units self._Units = units @@ -2355,17 +2388,41 @@ def percentile( return d + @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) def persist(self, inplace=False): - """TODODASK. + """Persist the underlying dask array into memory. - should this be called `to_memory`? This is part of the larger - scheme for memory management + This turns an underlying lazy dask array into an equivalent + chunked dask array, but now with the results fully computed. + + `persist` is particularly useful when using distributed + systems, because the results will be kept in distributed + memory, rather than returned to the local process. + + Compare with `compute` and `array`. **Performance** `persist` causes all delayed operations to be computed. + .. versionadded:: TODODASK + + .. seealso:: `compute`, `array`, `datetime_array`, + `dask.array.Array.persist` + + :Parameters: + + {{inplace: `bool`, optional}} + + :Returns: + + `Data` or `None` + The persisted data. If the operation was in-place then + `None` is returned.
+ + **Examples** + """ d = _inplace_enabled_define_and_cleanup(self) @@ -2921,6 +2978,51 @@ def ceil(self, inplace=False, i=False): d._set_dask(da.ceil(dx), reset_mask_hardness=False) return d + @daskified(_DASKIFIED_VERBOSE) + def compute(self): + """A numpy view of the data. + + In-place changes to the returned numpy array *might* affect + the underlying dask array, depending on how the dask array has + been defined, including any delayed operations. + + The returned numpy array has the same mask hardness and fill + values as the data. + + Compare with `array`. + + **Performance** + + `compute` causes all delayed operations to be computed. + + .. versionadded:: TODODASK + + .. seealso:: `persist`, `array`, `datetime_array` + + :Returns: + + `numpy.ndarray` + The numpy view of the data. + + **Examples** + + >>> d = cf.Data([1, 2, 3.0], 'km') + >>> d.compute() + array([1., 2., 3.]) + + """ + a = self._get_dask().compute() + + if np.ma.isMA(a): + if self.hardmask: + a.harden_mask() + else: + a.soften_mask() + + a.set_fill_value(self.fill_value) + + return a + @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) def convolution_filter( @@ -6291,20 +6393,23 @@ def size(self): @property @daskified(_DASKIFIED_VERBOSE) def array(self): - """A numpy array copy the data array. + """A numpy array copy of the data. - .. note:: If the data array is stored as date-time objects then a - numpy array of numeric reference times will be - returned. A numpy array of date-time objects may be - returned by the `datetime_array` attribute. + In-place changes to the returned numpy array do not affect the + underlying dask array. - **Performance** + The returned numpy array has the same mask hardness and fill + values as the data. + + Compare with `compute`. + + **Performance** - `array` causes all delayed operations to be computed. + `array` causes all delayed operations to be computed. - .. seealso:: `datetime_array`, `varray` + .. 
seealso:: `datetime_array`, `compute`, `persist` - **Examples:** + **Examples** >>> d = cf.Data([1, 2, 3.0], 'km') >>> a = d.array @@ -6319,37 +6424,34 @@ def array(self): >>> print(d[0]) -99.0 km - """ - dx = self._get_dask() - a = dx.compute() - - if np.ma.isMA(a): - if self.hardmask: - a.harden_mask() - else: - a.soften_mask() + >>> d = cf.Data('2000-12-1', units='days since 1999-12-1') + >>> print(d.array) + 366 + >>> print(d.datetime_array) + 2000-12-01 00:00:00 - return a + """ + return self.compute().copy() @property @daskified(_DASKIFIED_VERBOSE) def datetime_array(self): """An independent numpy array of date-time objects. - Only applicable to data arrays with reference time units. + Only applicable to data arrays with reference time units. - If the calendar has not been set then the CF default calendar will - be used and the units will be updated accordingly. + If the calendar has not been set then the CF default calendar will + be used and the units will be updated accordingly. - The data-type of the data array is unchanged. + The data-type of the data array is unchanged. - .. seealso:: `array` + .. seealso:: `array`, `compute`, `persist` - **Examples:** + **Performance** - **Performance** + `datetime_array` causes all delayed operations to be computed. - `datetime_array` causes all delayed operations to be computed. + **Examples** """ units = self.Units @@ -6391,7 +6493,7 @@ def datetime_array(self): d = self dx = d._get_dask() - dx = convert_to_datetime(dx, d.Units) # TODODASK + dx = convert_to_datetime(dx, d.Units) a = dx.compute() @@ -6401,9 +6503,26 @@ def datetime_array(self): else: a.soften_mask() + a.set_fill_value(self.fill_value) + return a @property + @daskified(_DASKIFIED_VERBOSE) + def varray(self): + """A numpy array view of the data array. + + Deprecated at version TODODASK. + + .. 
seealso:: `array`, `datetime_array`, `compute`, `persist` + + """ + raise NotImplementedError( + "The varray method was deprecated at version TODODASK" + ) + + @property + @daskified(_DASKIFIED_VERBOSE) def mask(self): """The Boolean missing data mask of the data array. @@ -6430,7 +6549,7 @@ def mask(self): dx = self._get_dask() mask = da.ma.getmaskarray(dx) - mask_data_obj._set_dask(mask, reset_mask_hardness=True) + mask_data_obj._set_dask(mask, reset_mask_hardness=False) mask_data_obj.override_units(_units_None, inplace=True) mask_data_obj.hardmask = True @@ -8444,10 +8563,9 @@ def asdata(cls, d, dtype=None, copy=False): data = data.copy() if dtype is not None and np.dtype(dtype) != data.dtype: data.dtype = dtype - else: - if dtype is not None and np.dtype(dtype) != data.dtype: - data = data.copy() - data.dtype = dtype + elif dtype is not None and np.dtype(dtype) != data.dtype: + data = data.copy() + data.dtype = dtype return data diff --git a/cf/data/utils.py b/cf/data/utils.py index cda7f77c6e..f249412bfe 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -5,8 +5,14 @@ import dask.array as da import numpy as np -from ..cfdatetime import dt as cf_dt -from ..cfdatetime import dt2rt, rt2dt, st2rt +from ..cfdatetime import ( + canonical_calendar, + default_calendar, + dt, + dt2rt, + rt2dt, + st2rt, +) from ..units import Units from .dask_utils import cf_YMDhms @@ -28,22 +34,22 @@ def _is_numeric_dtype(array): **Examples:** >>> a = np.array([0, 1, 2]) - >>> _is_numeric_dtype(a) + >>> cf.data.utils._is_numeric_dtype(a) True >>> a = np.array([False, True, True]) - >>> _is_numeric_dtype(a) + >>> cf.data.utils._is_numeric_dtype(a) True >>> a = np.array(["a", "b", "c"], dtype="S1") - >>> _is_numeric_dtype(a) + >>> cf.data.utils._is_numeric_dtype(a) False >>> a = np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]) - >>> _is_numeric_dtype(a) + >>> cf.data.utils._is_numeric_dtype(a) True >>> a = np.array(10) - >>> _is_numeric_dtype(a) + >>> cf.data.utils._is_numeric_dtype(a) 
True >>> a = np.empty(1, dtype=object) - >>> _is_numeric_dtype(a) + >>> cf.data.utils._is_numeric_dtype(a) False """ @@ -61,177 +67,261 @@ def _is_numeric_dtype(array): return np.issubdtype(dtype, np.number) or np.issubdtype(dtype, np.bool_) -def convert_to_datetime(array, units): - """Convert a daskarray to. +def convert_to_datetime(a, units): + """Convert a dask array of numbers to one of date-time objects. - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK - :Parameters: + .. seealso:: `convert_to_reftime` - array: dask array + :Parameters: - units : `Units` + a: `dask.array.Array` + The input numeric reference time values. - :Returns: + units: `Units` + The reference time units that define the output + date-time objects. + + :Returns: + + `dask.array.Array` + A new dask array containing date-time objects. + + **Examples** - dask array - A new dask array containing datetime objects. + >>> import dask.array as da + >>> d = da.from_array(2.5) + >>> e = cf.data.utils.convert_to_datetime(d, cf.Units("days since 2000-12-01")) + >>> print(e.compute()) + 2000-12-03 12:00:00 """ - dx = array.map_blocks(partial(rt2dt, units_in=units), dtype=object) - return dx + return a.map_blocks( + partial(rt2dt, units_in=units), + dtype=object, + meta=np.array((), dtype=object), + ) -def convert_to_reftime(array, units, first_value=None): +def convert_to_reftime(a, units=None, first_value=None): """Convert a dask array of string or object date-times to floating point reference times. - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK - :Parameters: + .. seealso:: `convert_to_datetime` - array: dask array + :Parameters: - units : `Units` + a: `dask.array.Array` - first_value : scalar, optional + units: `Units`, optional + Specify the units for the output reference time + values. By default the units are inferred from the first + non-missing value in the array, or set to ``<Units: days + since 1970-01-01 gregorian>`` if all values are missing.
- :Returns: + first_value: optional + If set, then assumed to be equal to the first non-missing + value of the array, thereby removing the need to find it + by inspection of *a*, which may be expensive. By default + the first non-missing value is found from *a*. - dask array, `Units` - A new dask array containing reference times, and its - units. + :Returns: + + (`dask.array.Array`, `Units`) + The reference times, and their units. + + >>> import dask.array as da + >>> d = da.from_array(2.5) + >>> e = cf.data.utils.convert_to_datetime(d, cf.Units("days since 2000-12-01")) + + >>> f, u = cf.data.utils.convert_to_reftime(e) + >>> f.compute() + 0.5 + >>> u + + + >>> f, u = cf.data.utils.convert_to_reftime(e, cf.Units("days since 1999-12-01")) + >>> f.compute() + 368.5 + >>> u + """ - kind = array.dtype.kind + kind = a.dtype.kind if kind in "US": # Convert date-time strings to reference time floats if not units: - value = first_value(array, first_value) - if value is not None: - YMD = str(value).partition("T")[0] + first_value = first_non_missing_value(a, cached=first_value) + if first_value is not None: + YMD = str(first_value).partition("T")[0] else: YMD = "1970-01-01" - units = Units("days since " + YMD, units._calendar) + units = Units("days since " + YMD, default_calendar) - array = array.map_blocks( + a = a.map_blocks( partial(st2rt, units_in=units, units_out=units), dtype=float ) elif kind == "O": # Convert date-time objects to reference time floats - value = first_value(array, first_value) - if value is not None: - x = value + first_value = first_non_missing_value(a, cached=first_value) + if first_value is not None: + x = first_value else: - x = cf_dt(1970, 1, 1, calendar="gregorian") + x = dt(1970, 1, 1, calendar=default_calendar) x_since = "days since " + "-".join(map(str, (x.year, x.month, x.day))) - x_calendar = getattr(x, "calendar", "gregorian") + x_calendar = getattr(x, "calendar", default_calendar) d_calendar = getattr(units, "calendar", None) d_units = 
getattr(units, "units", None) if x_calendar != "": - if d_calendar is not None: - if not units.equivalent(Units(x_since, x_calendar)): - raise ValueError( - f"Incompatible units: " - f"{units!r}, {Units(x_since, x_calendar)!r}" - ) - else: + if units is None: d_calendar = x_calendar + elif not units.equivalent(Units(x_since, x_calendar)): + raise ValueError( + "Incompatible units: " + f"{units!r}, {Units(x_since, x_calendar)!r}" + ) if not units: - # Set the units to something that is (hopefully) - # close to all of the datetimes, in an attempt to - # reduce errors arising from the conversion to - # reference times + # Set the units to something that is (hopefully) close to + # all of the datetimes, in an attempt to reduce errors + # arising from the conversion to reference times units = Units(x_since, calendar=d_calendar) else: units = Units(d_units, calendar=d_calendar) - # Check that all date-time objects have correct and - # equivalent calendars - calendars = unique_calendars(array) - if len(calendars) > 1: - raise ValueError( - "Not all date-time objects have equivalent " - f"calendars: {tuple(calendars)}" - ) - - # If the date-times are calendar-agnostic, assign the - # given calendar, defaulting to Gregorian. - if calendars.pop() == "": - calendar = getattr(units, "calendar", "gregorian") - - # TODODASK: can map_blocks this, I think - new_array = da.empty_like(array, dtype=object) - for i in np.ndindex(new_array.shape): - new_array[i] = cf_dt(array[i], calendar=calendar) - - array = new_array - # Convert the date-time objects to reference times - array = array.map_blocks(dt2rt, units_out=units, dtype=float) + a = a.map_blocks(dt2rt, units_in=None, units_out=units, dtype=float) if not units.isreftime: raise ValueError( f"Can't create a reference time array with units {units!r}" ) - return array, units + return a, units -def first_non_missing_value(array, cached=None): - """Return the first non-missing value of an array. 
+def first_non_missing_value(a, cached=None, method="index"): + """Return the first non-missing value of a dask array. - If the array contains only missing data then `None` is returned. + .. versionadded:: TODODASK - If a cached value is provided then that is returned without - looking for the actual first non-missing value. + :Parameters: - .. versionadded:: 4.0.0 + a: `dask.array.Array` + The array to be inspected. - :Parameters: + cached: scalar, optional + If set to a value other than `None`, then return it without + inspecting the array. This allows a previously found first + value to be used instead of a potentially costly array + access. + + method: `str`, optional + Select the method used to find the first non-missing + value. - array: dask array - The array to be inspected. + The default ``'index'`` method evaluates sequentially the + elements of the flattened array and returns when the first + non-missing value is found. - cached: scalar, optional - If set to a value other than `Ǹone`, then return this value - instead of inspecting the array. + The ``'mask'`` method finds the first non-missing value of + the flattened array as that which has the same location as + the first False element of the flattened array mask. + + It is considered likely that the ``'index'`` method is + fastest for data for which the first element is not + missing, but this may not always be the case. :Returns: - If the *cached* parameter is set then its value is - returned. Otherwise return the first non-missing value, or - `None` if there isn't one. + If set, then *cached* is returned. Otherwise returns the + first non-missing value of *a*, or `None` if there isn't + one.
+ + **Examples** + + >>> import dask.array as da + >>> d = da.arange(8).reshape(2, 4) + >>> print(d.compute()) + [[0 1 2 3] + [4 5 6 7]] + >>> cf.data.utils.first_non_missing_value(d) + 0 + >>> cf.data.utils.first_non_missing_value(d, cached=99) + 99 + >>> d[0, 0] = np.ma.masked + >>> cf.data.utils.first_non_missing_value(d) + 1 + >>> d[0, :] = np.ma.masked + >>> cf.data.utils.first_non_missing_value(d) + 4 + >>> cf.data.utils.first_non_missing_value(d, cached=99) + 99 + >>> d[...] = np.ma.masked + >>> print(cf.data.utils.first_non_missing_value(d)) + None + >>> print(cf.data.utils.first_non_missing_value(d, cached=99)) + 99 """ if cached is not None: return cached - # This does not look particularly efficient, but the expectation - # is that the first element in the array will not be missing data. - - shape = array.shape - for i in range(array.size): - index = np.unravel_index(i, shape) - x = array[index].compute() + if method == "index": + shape = a.shape + for i in range(a.size): + index = np.unravel_index(i, shape) + x = a[index].compute() + if not (x is np.ma.masked or np.ma.getmask(x)): + try: + return x.item() + except AttributeError: + return x + + return + + if method == "mask": + mask = da.ma.getmaskarray(a) + if not a.ndim: + # Scalar data + if mask: + return + + a = a.compute() + try: + return a.item() + except AttributeError: + return a + + x = a[da.unravel_index(mask.argmin(), a.shape)].compute() if x is np.ma.masked: - continue + return - return x.item() + try: + return x.item() + except AttributeError: + return x - return None + raise ValueError(f"Unknown value of 'method': {method!r}") -def unique_calendars(array): +def unique_calendars(a): """Find the unique calendars from a dask array of date-time objects. - .. versionadded:: 4.0.0 + .. versionadded:: TODODASK + + :Parameters: + + a: `dask.array.Array` + A dask array of date-time objects.
:Returns: @@ -241,21 +331,28 @@ def unique_calendars(array): """ def _get_calendar(x): - getattr(x, "calendar", "gregorian") + return getattr(x, "calendar", default_calendar) _calendars = np.vectorize(_get_calendar, otypes=[np.dtype(str)]) - array = array.map_blocks(_calendars, dtype=str) + # TODODASK + # + # da.unique doesn't work well with masked data (2022-02-07), so do + # move to numpy-space for now. When da.unique is better we can + # replace the next two lines of code with: + # + # a = a.map_blocks(_calendars, dtype=str) + # calendars = da.unique(array).compute() + a = _calendars(a.compute()) + calendars = np.unique(a) - cals = da.unique(array).compute() - if np.ma.isMA(cals): - cals = cals.compressed() + if np.ma.isMA(calendars): + calendars = calendars.compressed() - # TODODASK - need to allow differetn bu equivalent calendars, such - # as "gregorian" and 'standard'. Or perhaps this should by the - # caller? + # Replace each calendar with its canonical name + out = [canonical_calendar[cal] for cal in calendars.tolist()] - return set(cals.tolist()) + return set(out) @lru_cache(maxsize=32) @@ -282,29 +379,29 @@ def new_axis_identifier(existing_axes=(), basename="dim"): **Examples:** - >>> new_axis_identifier() + >>> cf.data.utils.new_axis_identifier() 'dim0' - >>> new_axis_identifier(['dim0']) + >>> cf.data.utils.new_axis_identifier(['dim0']) 'dim1' - >>> new_axis_identifier(['dim3']) + >>> cf.data.utils.new_axis_identifier(['dim3']) 'dim1' - >>> new_axis_identifier(['dim1']) + >>> cf.data.utils.new_axis_identifier(['dim1']) 'dim2' - >>> new_axis_identifier(['dim1', 'dim0']) + >>> cf.data.utils.new_axis_identifier(['dim1', 'dim0']) 'dim2' - >>> new_axis_identifier(['dim3', 'dim4']) + >>> cf.data.utils.new_axis_identifier(['dim3', 'dim4']) 'dim2' - >>> new_axis_identifier(['dim2', 'dim0']) + >>> cf.data.utils.new_axis_identifier(['dim2', 'dim0']) 'dim3' - >>> new_axis_identifier(['dim3', 'dim4', 'dim0']) + >>> cf.data.utils.new_axis_identifier(['dim3', 
'dim4', 'dim0']) 'dim5' - >>> d._new_axis_identifier(basename='axis') + >>> cf.data.utils.new_axis_identifier(basename='axis') 'axis0' - >>> d._new_axis_identifier(basename='axis') + >>> cf.data.utils.new_axis_identifier(basename='axis') 'axis0' - >>> d._new_axis_identifier(['dim0'], basename='axis') + >>> cf.data.utils.new_axis_identifier(['dim0'], basename='axis') 'axis1' - >>> d._new_axis_identifier(['dim0', 'dim1'], basename='axis') + >>> cf.data.utils.new_axis_identifier(['dim0', 'dim1'], basename='axis') 'axis2' """ @@ -333,7 +430,7 @@ def chunk_positions(chunks): **Examples** >>> chunks = ((1, 2), (9,), (44, 55, 66)) - >>> for position in chunk_positions(chunks): + >>> for position in cf.data.utils.chunk_positions(chunks): ... print(position) ... (0, 0, 0) @@ -363,7 +460,7 @@ def chunk_shapes(chunks): **Examples** >>> chunks = ((1, 2), (9,), (4, 5, 6)) - >>> for shape in chunk_shapes(chunks): + >>> for shape in cf.data.utils.chunk_shapes(chunks): ... print(shape) ... (1, 9, 4) @@ -441,22 +538,22 @@ def scalar_masked_array(dtype=float): **Examples** - >>> scalar_masked_array() + >>> cf.data.utils.scalar_masked_array() masked_array(data=--, mask=True, fill_value=1e+20, dtype=float64) - >>> scalar_masked_array(dtype('int32')) + >>> cf.data.utils.scalar_masked_array(dtype('int32')) masked_array(data=--, mask=True, fill_value=999999, dtype=int32) - >>> scalar_masked_array('U45') + >>> cf.data.utils.scalar_masked_array('U45') masked_array(data=--, mask=True, fill_value='N/A', dtype='>> scalar_masked_array(bool) + >>> cf.data.utils.scalar_masked_array(bool) masked_array(data=--, mask=True, fill_value=True, @@ -496,22 +593,22 @@ def conform_units(value, units): **Examples** - >>> conform_units(1, cf.Units('metres')) + >>> cf.data.utils.conform_units(1, cf.Units('metres')) 1 - >>> conform_units([1, 2, 3], cf.Units('metres')) + >>> cf.data.utils.conform_units([1, 2, 3], cf.Units('metres')) [1, 2, 3] - >>> import numpy - >>> conform_units(numpy.array([1, 2, 3]), 
cf.Units('metres')) + >>> import numpy as np + >>> cf.data.utils.conform_units(np.array([1, 2, 3]), cf.Units('metres')) array([1, 2, 3]) - >>> conform_units('string', cf.Units('metres')) + >>> cf.data.utils.conform_units('string', cf.Units('metres')) 'string' >>> d = cf.Data([1, 2] , 'm') - >>> conform_units(d, cf.Units('metres')) + >>> cf.data.utils.conform_units(d, cf.Units('metres')) >>> d = cf.Data([1, 2] , 'km') - >>> conform_units(d, cf.Units('metres')) + >>> cf.data.utils.conform_units(d, cf.Units('metres')) - >>> conform_units(d, cf.Units('s')) + >>> cf.data.utils.conform_units(d, cf.Units('s')) ... ValueError: Units are incompatible with units @@ -562,7 +659,7 @@ def YMDhms(d, attr): **Examples** >>> d = cf.Data([0, 1, 2], 'days since 1999-12-31') - >>> YMDhms(d, 'year').array + >>> cf.data.utils.YMDhms(d, 'year').array >>> array([1999, 2000, 2000]) """ diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 175a8c3a7e..7c25131a7d 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -1463,7 +1463,6 @@ def test_Data_any(self): d[...] 
= cf.masked self.assertFalse(d.any()) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "AssertionError: -999 != 0") def test_Data_array(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -1479,19 +1478,21 @@ def test_Data_array(self): self.assertIs(a[()], np.ma.masked) # Non-scalar numeric array - b = np.arange(10 * 15 * 19).reshape(10, 1, 15, 19) - d = cf.Data(b, "km") + b = np.arange(24).reshape(2, 1, 3, 4) + d = cf.Data(b, "km", fill_value=-123) a = d.array a[0, 0, 0, 0] = -999 a2 = d.array - self.assertEqual(a2[0, 0, 0, 0], 0) - self.assertEqual(a2.shape, b.shape) self.assertTrue((a2 == b).all()) self.assertFalse((a2 == a).all()) + # Fill value + d[0, 0, 0, 0] = cf.masked + self.assertEqual(d.array.fill_value, d.fill_value) + + # Date-time array d = cf.Data([["2000-12-3 12:00"]], "days since 2000-12-01", dt=True) - a = d.array - self.assertTrue((a == np.array([[2.5]])).all()) + self.assertEqual(d.array, 2.5) @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attr. 'partition_configuration'") def test_Data_binary_mask(self): @@ -1611,7 +1612,6 @@ def test_Data_months_years(self): ) d *= 31 - @unittest.skipIf(TEST_DASKIFIED_ONLY, "'NoneType' object is not callable") def test_Data_datetime_array(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -1634,11 +1634,6 @@ def test_Data_datetime_array(self): self.assertEqual(a.shape, ()) self.assertEqual(a, x) - a = d.datetime_array - a = d.array - self.assertEqual(a.shape, ()) - self.assertEqual(a, x) - # Non-scalar array for d, x in zip( [ @@ -1647,12 +1642,6 @@ def test_Data_datetime_array(self): ], ([[11292.5, 11293.5]], [[0, 1]]), ): - a = d.datetime_array - a = d.array - self.assertTrue((a == x).all()) - a = d.datetime_array - a = d.array - self.assertTrue((a == x).all()) a = d.datetime_array self.assertTrue( ( @@ -1668,6 +1657,9 @@ def test_Data_datetime_array(self): ).all() ) + a = d.array + self.assertTrue((a == x).all()) + def 
test_Data_asdatetime_asreftime_isdatetime(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -1981,38 +1973,6 @@ def test_Data_unique(self): d[1, -1] = cf.masked self.assertTrue((d.unique() == cf.Data([1, 2, 4], "metre")).all()) - @unittest.skipIf( - TEST_DASKIFIED_ONLY, "hits 'TODODASK - use harden_mask/soften_mask'" - ) - def test_Data_varray(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - - # Scalar array - d = cf.Data(9, "km") - d.hardmask = False - a = d.varray - self.assertEqual(a.shape, ()) - self.assertEqual(a, np.array(9)) - d[...] = cf.masked - a = d.varray - self.assertEqual(a.shape, ()) - self.assertIs(a[()], np.ma.masked) - a[()] = 18 - self.assertEqual(a, np.array(18)) - - b = np.arange(10 * 15 * 19).reshape(10, 1, 15, 19) - d = cf.Data(b, "km") - e = d.copy() - v = e.varray - v[0, 0, 0, 0] = -999 - v = e.varray - self.assertEqual(v[0, 0, 0, 0], -999) - self.assertEqual(v.shape, b.shape) - self.assertFalse((v == b).all()) - v[0, 0, 0, 0] = 0 - self.assertTrue((v == b).all()) - def test_Data_year_month_day_hour_minute_second(self): if self.test_only and inspect.stack()[0][3] not in self.test_only: return @@ -3831,6 +3791,48 @@ def test_Data__bool__(self): with self.assertRaises(ValueError): bool(cf.Data([1, 2])) + def test_Data_compute(self): + if self.test_only and inspect.stack()[0][3] not in self.test_only: + return + + # Scalar numeric array + d = cf.Data(9, "km") + a = d.compute() + self.assertIsInstance(a, np.ndarray) + self.assertEqual(a.shape, ()) + self.assertEqual(a, np.array(9)) + d[...] 
= cf.masked + a = d.compute() + self.assertEqual(a.shape, ()) + self.assertIs(a[()], np.ma.masked) + + # Non-scalar numeric array + b = np.arange(24).reshape(2, 1, 3, 4) + d = cf.Data(b, "km", fill_value=-123) + a = d.compute() + self.assertTrue((a == b).all()) + + # Fill value + d[0, 0, 0, 0] = cf.masked + self.assertEqual(d.compute().fill_value, d.fill_value) + + # Date-time array + d = cf.Data([["2000-12-3 12:00"]], "days since 2000-12-01", dt=True) + self.assertEqual(d.compute(), 2.5) + + def test_Data_persist(self): + if self.test_only and inspect.stack()[0][3] not in self.test_only: + return + + d = cf.Data(9, "km") + self.assertIsNone(d.persist(inplace=True)) + + # Scalar numeric array + d = cf.Data([1, 2, 3.0, 4], "km", mask=[0, 1, 0, 0], chunks=2) + e = d.persist() + self.assertIsInstance(e, cf.Data) + self.assertTrue(e.equals(d)) + def test_Data_cyclic(self): d = cf.Data(np.arange(12).reshape(3, 4)) self.assertEqual(d.cyclic(), set()) diff --git a/cf/test/test_Data_utils.py b/cf/test/test_Data_utils.py index 859e6241ee..78689b7b76 100644 --- a/cf/test/test_Data_utils.py +++ b/cf/test/test_Data_utils.py @@ -2,6 +2,7 @@ import faulthandler import unittest +import cftime import dask.array as da import numpy as np @@ -85,6 +86,191 @@ def test_Data_Utils__is_numeric_dtype(self): ]: self.assertFalse(_is_numeric_dtype(b)) + def test_Data_Utils_convert_to_datetime(self): + """TODO.""" + a = cftime.DatetimeGregorian(2000, 12, 3, 12) + for x in (2.5, [2.5]): + d = da.from_array(x) + e = cf.data.utils.convert_to_datetime( + d, cf.Units("days since 2000-12-01") + ) + self.assertEqual(e.compute(), a) + + a = [ + cftime.DatetimeGregorian(2000, 12, 1), + cftime.DatetimeGregorian(2000, 12, 2), + cftime.DatetimeGregorian(2000, 12, 3), + ] + for x in ([0, 1, 2], [[0, 1, 2]]): + d = da.from_array([0, 1, 2], chunks=2) + e = cf.data.utils.convert_to_datetime( + d, cf.Units("days since 2000-12-01") + ) + self.assertTrue((e.compute() == a).all()) + + def 
test_Data_Utils_convert_to_reftime(self): + """TODO.""" + a = cftime.DatetimeGregorian(2000, 12, 3, 12) + d = da.from_array(np.array(a, dtype=object)) + + e, u = cf.data.utils.convert_to_reftime(d) + self.assertEqual(e.compute(), 0.5) + self.assertEqual(u, cf.Units("days since 2000-12-03", "standard")) + + units = cf.Units("days since 2000-12-01") + e, u = cf.data.utils.convert_to_reftime(d, units=units) + self.assertEqual(e.compute(), 2.5) + self.assertEqual(u, units) + + a = "2000-12-03T12:00" + d = da.from_array(np.array(a, dtype=str)) + + e, u = cf.data.utils.convert_to_reftime(d) + self.assertEqual(e.compute(), 0.5) + self.assertEqual(u, cf.Units("days since 2000-12-03", "standard")) + + units = cf.Units("days since 2000-12-01") + e, u = cf.data.utils.convert_to_reftime(d, units=units) + self.assertEqual(e.compute(), 2.5) + self.assertEqual(u, units) + + a = [ + [ + cftime.DatetimeGregorian(2000, 12, 1), + cftime.DatetimeGregorian(2000, 12, 2), + cftime.DatetimeGregorian(2000, 12, 3), + ] + ] + d = da.from_array(np.ma.array(a, mask=[[1, 0, 0]]), chunks=2) + + e, u = cf.data.utils.convert_to_reftime(d) + self.assertTrue((e.compute() == [-99, 0, 1]).all()) + self.assertEqual(u, cf.Units("days since 2000-12-02", "standard")) + + units = cf.Units("days since 2000-12-03") + e, u = cf.data.utils.convert_to_reftime(d, units=units) + self.assertTrue((e.compute() == [-99, -1, 0]).all()) + self.assertEqual(u, units) + + def test_Data_Utils_unique_calendars(self): + """TODO.""" + a = [ + [ + cftime.DatetimeGregorian(2000, 12, 1), + cftime.DatetimeGregorian(2000, 12, 2), + cftime.DatetimeGregorian(2000, 12, 3), + ] + ] + d = da.from_array(np.ma.array(a, mask=[[1, 0, 0]]), chunks=2) + c = cf.data.utils.unique_calendars(d) + self.assertIsInstance(c, set) + self.assertEqual(c, set(["standard"])) + + a = cftime.DatetimeGregorian(2000, 12, 1) + d = da.from_array(np.array(a, dtype=object)) + c = cf.data.utils.unique_calendars(d) + self.assertEqual(c, set(["standard"])) + + 
d[()] = np.ma.masked + c = cf.data.utils.unique_calendars(d) + self.assertEqual(c, set()) + + a = [ + cftime.DatetimeGregorian(2000, 12, 1), + cftime.DatetimeAllLeap(2000, 12, 2), + cftime.DatetimeGregorian(2000, 12, 3), + ] + d = da.from_array(np.ma.array(a, mask=[1, 0, 0]), chunks=2) + c = cf.data.utils.unique_calendars(d) + self.assertEqual(c, set(["all_leap", "standard"])) + + def test_Data_Utils_first_non_missing_value(self): + """TODO.""" + for method in ("index", "mask"): + # Scalar data + d = da.from_array(0) + self.assertEqual( + cf.data.utils.first_non_missing_value(d, method=method), 0 + ) + self.assertEqual( + cf.data.utils.first_non_missing_value( + d, cached=99, method=method + ), + 99, + ) + + d[()] = np.ma.masked + self.assertEqual( + cf.data.utils.first_non_missing_value(d, method=method), None + ) + self.assertEqual( + cf.data.utils.first_non_missing_value( + d, cached=99, method=method + ), + 99, + ) + + # 1-d data + d = da.arange(8) + self.assertEqual( + cf.data.utils.first_non_missing_value(d, method=method), 0 + ) + self.assertEqual( + cf.data.utils.first_non_missing_value( + d, cached=99, method=method + ), + 99, + ) + + d[0] = np.ma.masked + self.assertEqual( + cf.data.utils.first_non_missing_value(d, method=method), 1 + ) + self.assertEqual( + cf.data.utils.first_non_missing_value( + d, cached=99, method=method + ), + 99, + ) + + # 2-d data + d = da.arange(8).reshape(2, 4) + self.assertEqual( + cf.data.utils.first_non_missing_value(d, method=method), 0 + ) + self.assertEqual( + cf.data.utils.first_non_missing_value( + d, cached=99, method=method + ), + 99, + ) + + d[0] = np.ma.masked + self.assertEqual( + cf.data.utils.first_non_missing_value(d, method=method), 4 + ) + self.assertEqual( + cf.data.utils.first_non_missing_value( + d, cached=99, method=method + ), + 99, + ) + + d[...] 
= np.ma.masked + self.assertEqual( + cf.data.utils.first_non_missing_value(d, method=method), None + ) + self.assertEqual( + cf.data.utils.first_non_missing_value( + d, cached=99, method=method + ), + 99, + ) + + # Bad method + with self.assertRaises(ValueError): + cf.data.utils.first_non_missing_value(d, method="bad") + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cf/timeduration.py b/cf/timeduration.py index babc7622a9..6a0e9e7498 100644 --- a/cf/timeduration.py +++ b/cf/timeduration.py @@ -34,7 +34,7 @@ # Default month lengths in days _default_month_lengths = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] -_default_calendar = "gregorian" +default_calendar = "gregorian" Offset = namedtuple( "offset", @@ -1436,9 +1436,9 @@ def _dHMS(duration, dt, end): else: return dt1, dt # dt1, dt.copy() - calendar = getattr(dt, "calendar", _default_calendar) + calendar = getattr(dt, "calendar", default_calendar) if calendar == "": - calendar = _default_calendar + calendar = default_calendar dt = cf_dt(dt, calendar=calendar) @@ -1572,9 +1572,9 @@ def bounds(self, dt, direction=True): """ abs_self = abs(self) - calendar = getattr(dt, "calendar", _default_calendar) + calendar = getattr(dt, "calendar", default_calendar) if calendar == "": - calendar = _default_calendar + calendar = default_calendar dt = cf_dt(dt, calendar=calendar) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 8c30d79eab..fb2f970e63 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -187,7 +187,7 @@ Required * `netCDF4 `_, 1.5.4 or newer. -* `cftime `_, version 1.5.0 or newer +* `cftime `_, version 1.6.0 or newer (note that this package may be installed with netCDF4). 
* `cfdm `_, version 1.9.0.1 or up to, diff --git a/requirements.txt b/requirements.txt index f1964f302d..46a3360f0b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ netCDF4>=1.5.4 -cftime>=1.5.0 +cftime>=1.6.0 numpy>=1.22 cfdm>=1.9.0.1, <1.9.1.0 psutil>=0.6.0