diff --git a/.travis.yml b/.travis.yml index ea9ee7adcf4..212ddb77daa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -65,7 +65,7 @@ script: elif [[ "$CONDA_ENV" == "py36-hypothesis" ]]; then pytest properties ; else - py.test xarray --cov=xarray --cov-config ci/.coveragerc --cov-report term-missing --verbose $EXTRA_FLAGS; + py.test xarray --cov=xarray --cov-config ci/.coveragerc --cov-report term-missing $EXTRA_FLAGS; fi after_success: diff --git a/README.rst b/README.rst index 6dbf774549d..83382f87ed5 100644 --- a/README.rst +++ b/README.rst @@ -8,9 +8,9 @@ xarray: N-D labeled arrays and datasets .. image:: https://coveralls.io/repos/pydata/xarray/badge.svg :target: https://coveralls.io/r/pydata/xarray .. image:: https://readthedocs.org/projects/xray/badge/?version=latest - :target: http://xarray.pydata.org/ -.. image:: http://img.shields.io/badge/benchmarked%20by-asv-green.svg?style=flat - :target: http://pandas.pydata.org/speed/xarray/ + :target: https://xarray.pydata.org/ +.. image:: https://img.shields.io/badge/benchmarked%20by-asv-green.svg?style=flat + :target: https://pandas.pydata.org/speed/xarray/ .. image:: https://img.shields.io/pypi/v/xarray.svg :target: https://pypi.python.org/pypi/xarray/ @@ -30,10 +30,10 @@ It is particularly tailored to working with netCDF_ files, which were the source of xarray's data model, and integrates tightly with dask_ for parallel computing. -.. _NumPy: http://www.numpy.org -.. _pandas: http://pandas.pydata.org -.. _dask: http://dask.org -.. _netCDF: http://www.unidata.ucar.edu/software/netcdf +.. _NumPy: https://www.numpy.org +.. _pandas: https://pandas.pydata.org +.. _dask: https://dask.org +.. _netCDF: https://www.unidata.ucar.edu/software/netcdf Why xarray? ----------- @@ -66,12 +66,12 @@ powerful and concise interface. For example: Documentation ------------- -Learn more about xarray in its official documentation at http://xarray.pydata.org/ +Learn more about xarray in its official documentation at https://xarray.pydata.org/ Contributing ------------ -You can find information about contributing to xarray at our `Contributing page `_. +You can find information about contributing to xarray at our `Contributing page `_. Get in touch ------------ @@ -81,9 +81,9 @@ Get in touch - For less well defined questions or ideas, or to announce other projects of interest to xarray users, use the `mailing list`_. -.. _StackOverFlow: http://stackoverflow.com/questions/tagged/python-xarray +.. _StackOverFlow: https://stackoverflow.com/questions/tagged/python-xarray .. _mailing list: https://groups.google.com/forum/#!forum/xarray -.. _on GitHub: http://github.com/pydata/xarray +.. _on GitHub: https://github.com/pydata/xarray NumFOCUS -------- @@ -120,7 +120,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9c88445b5ba..ed3d2d60442 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,25 +13,37 @@ What's New import xarray as xr np.random.seed(123456) -.. _whats-new.0.12.1: +.. _whats-new.0.12.2: -v0.12.1 (unreleased) +v0.12.2 (unreleased) -------------------- Enhancements ~~~~~~~~~~~~ +Bug fixes +~~~~~~~~~ + +.. _whats-new.0.12.1: + +v0.12.1 (4 April 2019) +---------------------- + +Enhancements +~~~~~~~~~~~~ + - Allow ``expand_dims`` method to support inserting/broadcasting dimensions with size > 1. (:issue:`2710`) By `Martin Pletcher `_. - Bug fixes ~~~~~~~~~ - Dataset.copy(deep=True) now creates a deep copy of the attrs (:issue:`2835`). By `Andras Gefferth `_. -- ``swap_dims`` would create incorrect ``indexes`` (:issue:`2842`). +- Fix incorrect ``indexes`` resulting from various ``Dataset`` operations + (e.g., ``swap_dims``, ``isel``, ``reindex``, ``[]``) (:issue:`2842`, + :issue:`2856`). By `Stephan Hoyer `_. .. _whats-new.0.12.0: @@ -121,6 +133,11 @@ Other enhancements By `Keisuke Fujii `_. - Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`). By `Kevin Squire `_. +- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=`` + parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for + backwards compatibility. The ``overwrite_encoded_chunks`` parameter is + added to remove the original zarr chunk encoding. + By `Lily Wang `_. Bug fixes ~~~~~~~~~ diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index ee77e0833c4..f5364314af8 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,3 +1,4 @@ +import warnings from collections import OrderedDict from distutils.version import LooseVersion @@ -352,10 +353,11 @@ def close(self): zarr.consolidate_metadata(self.ds.store) -def open_zarr(store, group=None, synchronizer=None, auto_chunk=True, +def open_zarr(store, group=None, synchronizer=None, chunks='auto', decode_cf=True, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, consolidated=False): + drop_variables=None, consolidated=False, + overwrite_encoded_chunks=False, **kwargs): """Load and decode a dataset from a Zarr store. .. note:: Experimental @@ -375,10 +377,15 @@ def open_zarr(store, group=None, synchronizer=None, auto_chunk=True, Array synchronizer provided to zarr group : str, obtional Group path. (a.k.a. `path` in zarr terminology.) - auto_chunk : bool, optional - Whether to automatically create dask chunks corresponding to each - variable's zarr chunks. If False, zarr array data will lazily convert - to numpy arrays upon access. + chunks : int or dict or tuple or {None, 'auto'}, optional + Chunk sizes along each dimension, e.g., ``5`` or + ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created + based on the variable's zarr chunks. If `chunks=None`, zarr array + data will lazily convert to numpy arrays upon access. This accepts + all the chunk specifications as Dask does. + overwrite_encoded_chunks: bool, optional + Whether to drop the zarr chunks encoded for each variable when a + dataset is loaded with specified chunk sizes (default: False) decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. @@ -422,6 +429,24 @@ def open_zarr(store, group=None, synchronizer=None, auto_chunk=True, ---------- http://zarr.readthedocs.io/ """ + if 'auto_chunk' in kwargs: + auto_chunk = kwargs.pop('auto_chunk') + if auto_chunk: + chunks = 'auto' # maintain backwards compatibility + else: + chunks = None + + warnings.warn("auto_chunk is deprecated. Use chunks='auto' instead.", + FutureWarning, stacklevel=2) + + if kwargs: + raise TypeError("open_zarr() got unexpected keyword arguments " + + ",".join(kwargs.keys())) + + if not isinstance(chunks, (int, dict)): + if chunks != 'auto' and chunks is not None: + raise ValueError("chunks must be an int, dict, 'auto', or None. " + "Instead found %s. " % chunks) if not decode_cf: mask_and_scale = False @@ -449,21 +474,60 @@ def maybe_decode_store(store, lock=False): # auto chunking needs to be here and not in ZarrStore because variable # chunks do not survive decode_cf - if auto_chunk: - # adapted from Dataset.Chunk() - def maybe_chunk(name, var): - from dask.base import tokenize - chunks = var.encoding.get('chunks') - if (var.ndim > 0) and (chunks is not None): - # does this cause any data to be read? - token2 = tokenize(name, var._data) - name2 = 'zarr-%s' % token2 - return var.chunk(chunks, name=name2, lock=None) - else: - return var - - variables = OrderedDict([(k, maybe_chunk(k, v)) - for k, v in ds.variables.items()]) - return ds._replace_vars_and_dims(variables) - else: + # return trivial case + if not chunks: return ds + + # adapted from Dataset.Chunk() + if isinstance(chunks, int): + chunks = dict.fromkeys(ds.dims, chunks) + + if isinstance(chunks, tuple) and len(chunks) == len(ds.dims): + chunks = dict(zip(ds.dims, chunks)) + + def get_chunk(name, var, chunks): + chunk_spec = dict(zip(var.dims, var.encoding.get('chunks'))) + + # Coordinate labels aren't chunked + if var.ndim == 1 and var.dims[0] == name: + return chunk_spec + + if chunks == 'auto': + return chunk_spec + + for dim in var.dims: + if dim in chunks: + spec = chunks[dim] + if isinstance(spec, int): + spec = (spec,) + if isinstance(spec, (tuple, list)) and chunk_spec[dim]: + if any(s % chunk_spec[dim] for s in spec): + warnings.warn("Specified Dask chunks %r would " + "separate Zarr chunk shape %r for " + "dimension %r. This significantly " + "degrades performance. Consider " + "rechunking after loading instead." + % (chunks[dim], chunk_spec[dim], dim), + stacklevel=2) + chunk_spec[dim] = chunks[dim] + return chunk_spec + + def maybe_chunk(name, var, chunks): + from dask.base import tokenize + + chunk_spec = get_chunk(name, var, chunks) + + if (var.ndim > 0) and (chunk_spec is not None): + # does this cause any data to be read? + token2 = tokenize(name, var._data) + name2 = 'zarr-%s' % token2 + var = var.chunk(chunk_spec, name=name2, lock=None) + if overwrite_encoded_chunks and var.chunks is not None: + var.encoding['chunks'] = tuple(x[0] for x in var.chunks) + return var + else: + return var + + variables = OrderedDict([(k, maybe_chunk(k, v, chunks)) + for k, v in ds.variables.items()]) + return ds._replace_vars_and_dims(variables) diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index d724554b458..2ee38a20a4d 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -79,7 +79,7 @@ def get_date_type(calendar): class BaseCFTimeOffset(object): _freq = None # type: ClassVar[str] - _day_option = None + _day_option = None # type: ClassVar[str] def __init__(self, n=1): if not isinstance(n, int): diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index af08eef268f..642be735e9b 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -315,36 +315,51 @@ def reindex_variables( """ from .dataarray import DataArray + # create variables for the new dataset + reindexed = OrderedDict() # type: OrderedDict[Any, Variable] + # build up indexers for assignment along each dimension int_indexers = {} - targets = OrderedDict() # type: OrderedDict[Any, pd.Index] + new_indexes = OrderedDict(indexes) masked_dims = set() unchanged_dims = set() - # size of reindexed dimensions - new_sizes = {} + for dim, indexer in indexers.items(): + if isinstance(indexer, DataArray) and indexer.dims != (dim,): + warnings.warn( + "Indexer has dimensions {0:s} that are different " + "from that to be indexed along {1:s}. " + "This will behave differently in the future.".format( + str(indexer.dims), dim), + FutureWarning, stacklevel=3) + + target = new_indexes[dim] = utils.safe_cast_to_index(indexers[dim]) + + if dim in indexes: + index = indexes[dim] - for name, index in indexes.items(): - if name in indexers: if not index.is_unique: raise ValueError( 'cannot reindex or align along dimension %r because the ' - 'index has duplicate values' % name) - - target = utils.safe_cast_to_index(indexers[name]) - new_sizes[name] = len(target) + 'index has duplicate values' % dim) int_indexer = get_indexer_nd(index, target, method, tolerance) # We uses negative values from get_indexer_nd to signify # values that are missing in the index. if (int_indexer < 0).any(): - masked_dims.add(name) + masked_dims.add(dim) elif np.array_equal(int_indexer, np.arange(len(index))): - unchanged_dims.add(name) + unchanged_dims.add(dim) - int_indexers[name] = int_indexer - targets[name] = target + int_indexers[dim] = int_indexer + + if dim in variables: + var = variables[dim] + args = (var.attrs, var.encoding) # type: tuple + else: + args = () + reindexed[dim] = IndexVariable((dim,), target, *args) for dim in sizes: if dim not in indexes and dim in indexers: @@ -356,25 +371,6 @@ def reindex_variables( 'index because its size %r is different from the size of ' 'the new index %r' % (dim, existing_size, new_size)) - # create variables for the new dataset - reindexed = OrderedDict() # type: OrderedDict[Any, Variable] - - for dim, indexer in indexers.items(): - if isinstance(indexer, DataArray) and indexer.dims != (dim,): - warnings.warn( - "Indexer has dimensions {0:s} that are different " - "from that to be indexed along {1:s}. " - "This will behave differently in the future.".format( - str(indexer.dims), dim), - FutureWarning, stacklevel=3) - - if dim in variables: - var = variables[dim] - args = (var.attrs, var.encoding) # type: tuple - else: - args = () - reindexed[dim] = IndexVariable((dim,), indexers[dim], *args) - for name, var in variables.items(): if name not in indexers: key = tuple(slice(None) @@ -395,9 +391,6 @@ def reindex_variables( reindexed[name] = new_var - new_indexes = OrderedDict(indexes) - new_indexes.update(targets) - return reindexed, new_indexes diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index c24703f5384..a9e55159f57 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -231,9 +231,6 @@ def __init__(self, data, coords=None, dims=None, name=None, coords, dims = _infer_coords_and_dims(data.shape, coords, dims) variable = Variable(dims, data, attrs, encoding, fastpath=True) - # uncomment for a useful consistency check: - # assert all(isinstance(v, Variable) for v in coords.values()) - # These fully describe a DataArray self._variable = variable self._coords = coords diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e3b2e3c3d2c..cf6631fa5ba 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -938,6 +938,7 @@ def _copy_listed(self: T, names) -> T: """ variables = OrderedDict() # type: OrderedDict[Any, Variable] coord_names = set() + indexes = OrderedDict() # type: OrderedDict[Any, pd.Index] for name in names: try: @@ -948,6 +949,8 @@ def _copy_listed(self: T, names) -> T: variables[var_name] = var if ref_name in self._coord_names or ref_name in self.dims: coord_names.add(var_name) + if (var_name,) == var.dims: + indexes[var_name] = var.to_index() needed_dims = set() # type: set for v in variables.values(): @@ -959,12 +962,8 @@ def _copy_listed(self: T, names) -> T: if set(self.variables[k].dims) <= needed_dims: variables[k] = self._variables[k] coord_names.add(k) - - if self._indexes is None: - indexes = None - else: - indexes = OrderedDict((k, v) for k, v in self._indexes.items() - if k in coord_names) + if k in self.indexes: + indexes[k] = self.indexes[k] return self._replace(variables, coord_names, dims, indexes=indexes) @@ -1503,9 +1502,13 @@ def _validate_indexers( raise ValueError("dimensions %r do not exist" % invalid) # all indexers should be int, slice, np.ndarrays, or Variable - indexers_list = [] + indexers_list = [] # type: List[Tuple[Any, Union[slice, Variable]]] for k, v in indexers.items(): - if isinstance(v, (slice, Variable)): + if isinstance(v, slice): + indexers_list.append((k, v)) + continue + + if isinstance(v, Variable): pass elif isinstance(v, DataArray): v = v.variable @@ -1524,14 +1527,19 @@ def _validate_indexers( v = _parse_array_of_cftime_strings(v, index.date_type) if v.ndim == 0: - v = as_variable(v) + v = Variable((), v) elif v.ndim == 1: - v = as_variable((k, v)) + v = IndexVariable((k,), v) else: raise IndexError( "Unlabeled multi-dimensional array cannot be " "used for indexing: {}".format(k)) + + if v.ndim == 1: + v = v.to_index_variable() + indexers_list.append((k, v)) + return indexers_list def _get_indexers_coords_and_indexes(self, indexers): @@ -1631,7 +1639,7 @@ def isel(self, indexers=None, drop=False, **indexers_kwargs): if name in self.indexes: new_var, new_index = isel_variable_and_index( - var, self.indexes[name], var_indexers) + name, var, self.indexes[name], var_indexers) if new_index is not None: indexes[name] = new_index else: @@ -2117,15 +2125,20 @@ def _validate_interp_indexer(x, new_x): indexes = OrderedDict( (k, v) for k, v in obj.indexes.items() if k not in indexers) selected = self._replace_with_new_dims( - variables, coord_names, indexes=indexes) + variables.copy(), coord_names, indexes=indexes) # attach indexer as coordinate variables.update(indexers) + indexes.update( + (k, v.to_index()) for k, v in indexers.items() if v.dims == (k,) + ) + # Extract coordinates from indexers coord_vars, new_indexes = ( selected._get_indexers_coords_and_indexes(coords)) variables.update(coord_vars) indexes.update(new_indexes) + coord_names = (set(variables) .intersection(obj._coord_names) .union(coord_vars)) @@ -2401,6 +2414,7 @@ def expand_dims(self, dim=None, axis=None, **dim_kwargs): ' variable name.'.format(dim=d)) variables = OrderedDict() + coord_names = self._coord_names.copy() # If dim is a dict, then ensure that the values are either integers # or iterables. for k, v in dim.items(): @@ -2410,7 +2424,7 @@ def expand_dims(self, dim=None, axis=None, **dim_kwargs): # value within the dim dict to the length of the iterable # for later use. variables[k] = xr.IndexVariable((k,), v) - self._coord_names.add(k) + coord_names.add(k) dim[k] = variables[k].size elif isinstance(v, int): pass # Do nothing if the dimensions value is just an int @@ -2420,7 +2434,7 @@ def expand_dims(self, dim=None, axis=None, **dim_kwargs): for k, v in self._variables.items(): if k not in dim: - if k in self._coord_names: # Do not change coordinates + if k in coord_names: # Do not change coordinates variables[k] = v else: result_ndim = len(v.dims) + len(axis) @@ -2452,10 +2466,10 @@ def expand_dims(self, dim=None, axis=None, **dim_kwargs): variables[k] = v.set_dims(k) new_dims = self._dims.copy() - for d in dim: - new_dims[d] = 1 + new_dims.update(dim) - return self._replace(variables, dims=new_dims) + return self._replace_vars_and_dims( + variables, dims=new_dims, coord_names=coord_names) def set_index(self, indexes=None, append=False, inplace=None, **indexes_kwargs): diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 6d8b553036a..eccb72b6a58 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -1,6 +1,6 @@ import collections.abc from collections import OrderedDict -from typing import Any, Iterable, Mapping, Optional, Tuple, Union +from typing import Any, Hashable, Iterable, Mapping, Optional, Tuple, Union import pandas as pd @@ -59,6 +59,7 @@ def default_indexes( def isel_variable_and_index( + name: Hashable, variable: Variable, index: pd.Index, indexers: Mapping[Any, Union[slice, Variable]], @@ -75,8 +76,8 @@ def isel_variable_and_index( new_variable = variable.isel(indexers) - if new_variable.ndim != 1: - # can't preserve a index if result is not 0D + if new_variable.dims != (name,): + # can't preserve a index if result has new dimensions return new_variable, None # we need to compute the new index diff --git a/xarray/testing.py b/xarray/testing.py index 794c0614925..eb8a0e8603d 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -1,8 +1,12 @@ """Testing functions exposed to the user API""" +from collections import OrderedDict + import numpy as np +import pandas as pd from xarray.core import duck_array_ops from xarray.core import formatting +from xarray.core.indexes import default_indexes def _decode_string_data(data): @@ -143,8 +147,37 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): .format(type(a))) -def assert_combined_tile_ids_equal(dict1, dict2): - assert len(dict1) == len(dict2) - for k, v in dict1.items(): - assert k in dict2.keys() - assert_equal(dict1[k], dict2[k]) +def _assert_indexes_invariants_checks(indexes, possible_coord_variables, dims): + import xarray as xr + + assert isinstance(indexes, OrderedDict), indexes + assert all(isinstance(v, pd.Index) for v in indexes.values()), \ + {k: type(v) for k, v in indexes.items()} + + index_vars = {k for k, v in possible_coord_variables.items() + if isinstance(v, xr.IndexVariable)} + assert indexes.keys() <= index_vars, (set(indexes), index_vars) + + # Note: when we support non-default indexes, these checks should be opt-in + # only! + defaults = default_indexes(possible_coord_variables, dims) + assert indexes.keys() == defaults.keys(), \ + (set(indexes), set(defaults)) + assert all(v.equals(defaults[k]) for k, v in indexes.items()), \ + (indexes, defaults) + + +def _assert_indexes_invariants(a): + """Separate helper function for checking indexes invariants only.""" + import xarray as xr + + if isinstance(a, xr.DataArray): + if a._indexes is not None: + _assert_indexes_invariants_checks(a._indexes, a._coords, a.dims) + elif isinstance(a, xr.Dataset): + if a._indexes is not None: + _assert_indexes_invariants_checks( + a._indexes, a._variables, a._dims) + elif isinstance(a, xr.Variable): + # no indexes + pass diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 4ebcc29a61e..525360701fe 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -13,8 +13,7 @@ from xarray.core import utils from xarray.core.options import set_options from xarray.core.indexing import ExplicitlyIndexed -from xarray.testing import (assert_equal, assert_identical, # noqa: F401 - assert_allclose, assert_combined_tile_ids_equal) +import xarray.testing from xarray.plot.utils import import_seaborn try: @@ -180,3 +179,25 @@ def source_ndarray(array): if base is None: base = array return base + + +# Internal versions of xarray's test functions that validate additional +# invariants +# TODO: add more invariant checks. + +def assert_equal(a, b): + xarray.testing.assert_equal(a, b) + xarray.testing._assert_indexes_invariants(a) + xarray.testing._assert_indexes_invariants(b) + + +def assert_identical(a, b): + xarray.testing.assert_identical(a, b) + xarray.testing._assert_indexes_invariants(a) + xarray.testing._assert_indexes_invariants(b) + + +def assert_allclose(a, b, **kwargs): + xarray.testing.assert_allclose(a, b, **kwargs) + xarray.testing._assert_indexes_invariants(a) + xarray.testing._assert_indexes_invariants(b) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a20ba2df229..bf40e529931 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1391,7 +1391,7 @@ def test_auto_chunk(self): original = create_test_data().chunk() with self.roundtrip( - original, open_kwargs={'auto_chunk': False}) as actual: + original, open_kwargs={'chunks': None}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) @@ -1399,19 +1399,101 @@ def test_auto_chunk(self): assert v.chunks is None with self.roundtrip( - original, open_kwargs={'auto_chunk': True}) as actual: + original, open_kwargs={'chunks': 'auto'}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) # chunk size should be the same as original assert v.chunks == original[k].chunks + def test_manual_chunk(self): + original = create_test_data().chunk({'dim1': 3, 'dim2': 4, 'dim3': 3}) + + # All of these should return non-chunked arrays + NO_CHUNKS = (None, 0, {}) + for no_chunk in NO_CHUNKS: + open_kwargs = {'chunks': no_chunk} + with self.roundtrip(original, open_kwargs=open_kwargs) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # there should be no chunks + assert v.chunks is None + + # uniform arrays + for i in range(2, 6): + rechunked = original.chunk(chunks=i) + open_kwargs = {'chunks': i} + with self.roundtrip(original, open_kwargs=open_kwargs) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # chunk size should be the same as rechunked + assert v.chunks == rechunked[k].chunks + + chunks = {'dim1': 2, 'dim2': 3, 'dim3': 5} + rechunked = original.chunk(chunks=chunks) + + open_kwargs = {'chunks': chunks, 'overwrite_encoded_chunks': True} + with self.roundtrip(original, open_kwargs=open_kwargs) as actual: + for k, v in actual.variables.items(): + assert v.chunks == rechunked[k].chunks + + with self.roundtrip(actual) as auto: + # encoding should have changed + for k, v in actual.variables.items(): + assert v.chunks == rechunked[k].chunks + + assert_identical(actual, auto) + assert_identical(actual.load(), auto.load()) + + def test_warning_on_bad_chunks(self): + original = create_test_data().chunk({'dim1': 4, 'dim2': 3, 'dim3': 5}) + + bad_chunks = (2, {'dim2': (3, 3, 2, 1)}) + for chunks in bad_chunks: + kwargs = {'chunks': chunks} + with pytest.warns(UserWarning): + with self.roundtrip(original, open_kwargs=kwargs) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + + good_chunks = ({'dim2': 3}, {'dim3': 10}) + for chunks in good_chunks: + kwargs = {'chunks': chunks} + with pytest.warns(None) as record: + with self.roundtrip(original, open_kwargs=kwargs) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + assert len(record) == 0 + + def test_deprecate_auto_chunk(self): + original = create_test_data().chunk() + with pytest.warns(FutureWarning): + with self.roundtrip( + original, open_kwargs={'auto_chunk': True}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # chunk size should be the same as original + assert v.chunks == original[k].chunks + + with pytest.warns(FutureWarning): + with self.roundtrip( + original, open_kwargs={'auto_chunk': False}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # there should be no chunks + assert v.chunks is None + def test_write_uneven_dask_chunks(self): # regression for GH#2225 original = create_test_data().chunk({'dim1': 3, 'dim2': 4, 'dim3': 3}) - with self.roundtrip( - original, open_kwargs={'auto_chunk': True}) as actual: + original, open_kwargs={'chunks': 'auto'}) as actual: for k, v in actual.data_vars.items(): print(k) assert v.chunks == actual[k].chunks diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 0d03b6e0cdf..6d0f4626086 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -13,7 +13,7 @@ _infer_tile_ids_from_nested_list, _new_tile_id) from . import ( - InaccessibleArray, assert_array_equal, assert_combined_tile_ids_equal, + InaccessibleArray, assert_array_equal, assert_equal, assert_identical, raises_regex, requires_dask) from .test_dataset import create_test_data @@ -418,6 +418,13 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) +def assert_combined_tile_ids_equal(dict1, dict2): + assert len(dict1) == len(dict2) + for k, v in dict1.items(): + assert k in dict2.keys() + assert_equal(dict1[k], dict2[k]) + + class TestTileIDsFromNestedList(object): def test_1d(self): ds = create_test_data diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index ab83d385ef4..3ace80f5eea 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2053,12 +2053,8 @@ def test_expand_dims_error(self): if python36_plus: with raises_regex(ValueError, 'both keyword and positional'): original.expand_dims(OrderedDict((("d", 4),)), e=4) - else: - # In python 3.5, using dim_kwargs should raise a ValueError. - with raises_regex(ValueError, "dim_kwargs isn't"): - original.expand_dims(OrderedDict((("d", 4),)), e=4) - def test_expand_dims(self): + def test_expand_dims_int(self): original = Dataset({'x': ('a', np.random.randn(3)), 'y': (['b', 'a'], np.random.randn(4, 3))}, coords={'a': np.linspace(0, 1, 3), @@ -2091,9 +2087,37 @@ def test_expand_dims(self): roundtripped = actual.squeeze('z') assert_identical(original, roundtripped) + def test_expand_dims_coords(self): + original = Dataset({'x': ('a', np.array([1, 2, 3]))}) + expected = Dataset( + {'x': (('b', 'a'), np.array([[1, 2, 3], [1, 2, 3]]))}, + coords={'b': [1, 2]}, + ) + actual = original.expand_dims(OrderedDict(b=[1, 2])) + assert_identical(expected, actual) + assert 'b' not in original._coord_names + + def test_expand_dims_existing_scalar_coord(self): + original = Dataset({'x': 1}, {'a': 2}) + expected = Dataset({'x': (('a',), [1])}, {'a': [2]}) + actual = original.expand_dims('a') + assert_identical(expected, actual) + + def test_isel_expand_dims_roundtrip(self): + original = Dataset({'x': (('a',), [1])}, {'a': [2]}) + actual = original.isel(a=0).expand_dims('a') + assert_identical(actual, original) + + def test_expand_dims_mixed_int_and_coords(self): # Test expanding one dimension to have size > 1 that doesn't have # coordinates, and also expanding another dimension to have size > 1 # that DOES have coordinates. + original = Dataset({'x': ('a', np.random.randn(3)), + 'y': (['b', 'a'], np.random.randn(4, 3))}, + coords={'a': np.linspace(0, 1, 3), + 'b': np.linspace(0, 1, 4), + 'c': np.linspace(0, 1, 5)}) + actual = original.expand_dims( OrderedDict((("d", 4), ("e", ["l", "m", "n"])))) @@ -2109,34 +2133,45 @@ def test_expand_dims(self): b=np.linspace(0, 1, 4), a=np.linspace(0, 1, 3)), dims=['d', 'e', 'b', 'a']).drop('d')}, - coords={'c': np.linspace(0, 1, 5)}, - attrs={'key': 'entry'}) + coords={'c': np.linspace(0, 1, 5)}) assert_identical(actual, expected) - # Test with kwargs instead of passing dict to dim arg. - - # TODO: only the code under the if-statement is needed when python 3.5 - # is no longer supported. - python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 - if python36_plus: - other_way = original.expand_dims(e=["l", "m", "n"]) - other_way_expected = Dataset( - {'x': xr.DataArray(original['x'].values * np.ones([3, 3]), - coords=dict(e=['l', 'm', 'n'], - a=np.linspace(0, 1, 3)), - dims=['e', 'a']), - 'y': xr.DataArray(original['y'].values * np.ones([3, 4, 3]), - coords=dict(e=['l', 'm', 'n'], - b=np.linspace(0, 1, 4), - a=np.linspace(0, 1, 3)), - dims=['e', 'b', 'a'])}, - coords={'c': np.linspace(0, 1, 5)}, - attrs={'key': 'entry'}) - assert_identical(other_way_expected, other_way) - else: - # In python 3.5, using dim_kwargs should raise a ValueError. - with raises_regex(ValueError, "dim_kwargs isn't"): - original.expand_dims(e=["l", "m", "n"]) + @pytest.mark.skipif( + sys.version_info[:2] > (3, 5), + reason="we only raise these errors for Python 3.5", + ) + def test_expand_dims_kwargs_python35(self): + original = Dataset({'x': ('a', np.random.randn(3))}) + with raises_regex(ValueError, "dim_kwargs isn't"): + original.expand_dims(e=["l", "m", "n"]) + with raises_regex(TypeError, "must be an OrderedDict"): + original.expand_dims({'e': ["l", "m", "n"]}) + + @pytest.mark.skipif( + sys.version_info[:2] < (3, 6), + reason='keyword arguments are only ordered on Python 3.6+', + ) + def test_expand_dims_kwargs_python36plus(self): + original = Dataset({'x': ('a', np.random.randn(3)), + 'y': (['b', 'a'], np.random.randn(4, 3))}, + coords={'a': np.linspace(0, 1, 3), + 'b': np.linspace(0, 1, 4), + 'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + other_way = original.expand_dims(e=["l", "m", "n"]) + other_way_expected = Dataset( + {'x': xr.DataArray(original['x'].values * np.ones([3, 3]), + coords=dict(e=['l', 'm', 'n'], + a=np.linspace(0, 1, 3)), + dims=['e', 'a']), + 'y': xr.DataArray(original['y'].values * np.ones([3, 4, 3]), + coords=dict(e=['l', 'm', 'n'], + b=np.linspace(0, 1, 4), + a=np.linspace(0, 1, 3)), + dims=['e', 'b', 'a'])}, + coords={'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + assert_identical(other_way_expected, other_way) def test_set_index(self): expected = create_test_multiindex() diff --git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py index 5596bfb3bfb..8347d54bd1e 100644 --- a/xarray/tests/test_interp.py +++ b/xarray/tests/test_interp.py @@ -291,7 +291,7 @@ def test_errors(use_dask): if use_dask: da = get_example_data(3) else: - da = get_example_data(1) + da = get_example_data(0) result = da.interp(x=[-1, 1, 3], kwargs={'fill_value': 0.0}) assert not np.isnan(result.values).any()