From c7c990b4929dd0082e6975e6cc502c5dd6c988e6 Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Thu, 1 Nov 2018 05:52:13 +1100 Subject: [PATCH 01/30] added manual chunks for open_zarr --- xarray/backends/zarr.py | 78 +++++++++++++++++++++++++---------- xarray/tests/test_backends.py | 46 +++++++++++++++++++++ 2 files changed, 103 insertions(+), 21 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index ee77e0833c4..a5ee9867f8e 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -352,10 +352,11 @@ def close(self): zarr.consolidate_metadata(self.ds.store) -def open_zarr(store, group=None, synchronizer=None, auto_chunk=True, +def open_zarr(store, group=None, synchronizer=None, chunks=None, decode_cf=True, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, consolidated=False): + drop_variables=None, consolidated=False, auto_chunk=True, + overwrite_encoded_chunks=False): """Load and decode a dataset from a Zarr store. .. note:: Experimental @@ -375,10 +376,19 @@ def open_zarr(store, group=None, synchronizer=None, auto_chunk=True, Array synchronizer provided to zarr group : str, obtional Group path. (a.k.a. `path` in zarr terminology.) + chunks : int or dict or {None, 'auto'}, optional + Chunk sizes along each dimension, e.g., ``5`` or + ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created + based on the variable's zarr chunks. If `chunks=None` and + `auto_chunk=False`, zarr array data will lazily convert to numpy + arrays upon access. auto_chunk : bool, optional Whether to automatically create dask chunks corresponding to each - variable's zarr chunks. If False, zarr array data will lazily convert - to numpy arrays upon access. + variable's zarr chunks. If `chunks=None`, this overrides `chunks`. + Equivalent to `chunks='auto'.` (Default: True) + overwrite_encoded_chunks: bool, optional + Whether to drop the zarr chunks encoded for each variable when a + dataset is loaded with specified chunk sizes (default: False) decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. @@ -423,6 +433,15 @@ def open_zarr(store, group=None, synchronizer=None, auto_chunk=True, http://zarr.readthedocs.io/ """ + if auto_chunk and chunks is None: + chunks = 'auto' # maintain backwards compatibility + + if not isinstance(chunks, (int, dict)): + if chunks != 'auto' and chunks is not None: + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. " % chunks) + if not decode_cf: mask_and_scale = False decode_times = False @@ -449,21 +468,38 @@ def maybe_decode_store(store, lock=False): # auto chunking needs to be here and not in ZarrStore because variable # chunks do not survive decode_cf - if auto_chunk: - # adapted from Dataset.Chunk() - def maybe_chunk(name, var): - from dask.base import tokenize - chunks = var.encoding.get('chunks') - if (var.ndim > 0) and (chunks is not None): - # does this cause any data to be read? - token2 = tokenize(name, var._data) - name2 = 'zarr-%s' % token2 - return var.chunk(chunks, name=name2, lock=None) - else: - return var - - variables = OrderedDict([(k, maybe_chunk(k, v)) - for k, v in ds.variables.items()]) - return ds._replace_vars_and_dims(variables) - else: + # return trivial case + if not chunks: return ds + + # adapted from Dataset.Chunk() + if isinstance(chunks, int): + chunks = dict.fromkeys(ds.dims, chunks) + + def selkeys(dict_, keys): + if dict_ is None: + return None + return dict((d, dict_[d]) for d in keys if d in dict_) + + def maybe_chunk(name, var, chunks): + from dask.base import tokenize + + if chunks == 'auto': + chunks = var.encoding.get('chunks') + else: + chunks = selkeys(chunks, var.dims) + + if (var.ndim > 0) and (chunks is not None): + # does this cause any data to be read? + token2 = tokenize(name, var._data) + name2 = 'zarr-%s' % token2 + var = var.chunk(chunks, name=name2, lock=None) + if overwrite_encoded_chunks and var.chunks is not None: + var.encoding['chunks'] = tuple(x[0] for x in var.chunks) + return var + else: + return var + + variables = OrderedDict([(k, maybe_chunk(k, v, chunks)) + for k, v in ds.variables.items()]) + return ds._replace_vars_and_dims(variables) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a20ba2df229..5f21027962f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1405,6 +1405,52 @@ def test_auto_chunk(self): assert v._in_memory == (k in actual.dims) # chunk size should be the same as original assert v.chunks == original[k].chunks + + def test_manual_chunk(self): + original = create_test_data().chunk({'dim1': 3, 'dim2': 4, 'dim3': 3}) + + # All of these should return non-chunked arrays + NO_CHUNKS = (None, 0, {}) + for no_chunk in NO_CHUNKS: + with self.roundtrip( + original, open_kwargs={'chunks': no_chunk, + 'auto_chunk': False}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # there should be no chunks + assert v.chunks is None + + # uniform arrays + for i in range(2, 6): + rechunked = original.chunk(chunks=i) + + with self.roundtrip( + original, open_kwargs={'chunks': i}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # chunk size should be the same as rechunked + assert v.chunks == rechunked[k].chunks + + chunks = {'dim1': 2, 'dim2': 3, 'dim3': 5} + rechunked = original.chunk(chunks=chunks) + + open_overwritten = {'chunks': chunks, + 'overwrite_encoded_chunks': True} + + with self.roundtrip( + original, open_kwargs=open_overwritten) as actual: + for k, v in actual.variables.items(): + assert v.chunks == rechunked[k].chunks + + with self.roundtrip(actual, open_kwargs={'chunks': 'auto'}) as auto: + # encoding should have changed + for k, v in actual.variables.items(): + assert v.chunks == rechunked[k].chunks + + assert_identical(actual, auto) + assert_identical(actual.load(), auto.load()) def test_write_uneven_dask_chunks(self): # regression for GH#2225 From d37d9e1572dff5b0642aa79c5aac2afc21e4a00d Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Thu, 1 Nov 2018 05:53:03 +1100 Subject: [PATCH 02/30] updated whats-new --- doc/whats-new.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9c88445b5ba..86d30ce42ad 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -428,6 +428,12 @@ Bug fixes encoding process if a reference date is used that is so distant that the dates must be encoded using cftime rather than NumPy (:issue:`2272`). By `Spencer Clark `_. + +- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=`` + parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for + backwards compatibility. The ``overwrite_encoded_chunks`` parameter is + added to remove the original zarr chunk encoding. + By `Lily Wang `_. - Chunked datasets can now roundtrip to Zarr storage continually with `to_zarr` and ``open_zarr`` (:issue:`2300`). From f3c829e412fbc6ff94455d0499c2b4607e214190 Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Thu, 1 Nov 2018 06:14:23 +1100 Subject: [PATCH 03/30] fixed pep8 issues --- xarray/backends/zarr.py | 23 +++++++++++------------ xarray/tests/test_backends.py | 25 ++++++++++--------------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index a5ee9867f8e..89f55400e82 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -378,16 +378,16 @@ def open_zarr(store, group=None, synchronizer=None, chunks=None, Group path. (a.k.a. `path` in zarr terminology.) chunks : int or dict or {None, 'auto'}, optional Chunk sizes along each dimension, e.g., ``5`` or - ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created - based on the variable's zarr chunks. If `chunks=None` and - `auto_chunk=False`, zarr array data will lazily convert to numpy - arrays upon access. + ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created + based on the variable's zarr chunks. If `chunks=None` and + `auto_chunk=False`, zarr array data will lazily convert to numpy + arrays upon access. auto_chunk : bool, optional Whether to automatically create dask chunks corresponding to each - variable's zarr chunks. If `chunks=None`, this overrides `chunks`. + variable's zarr chunks. If `chunks=None`, this overrides `chunks`. Equivalent to `chunks='auto'.` (Default: True) overwrite_encoded_chunks: bool, optional - Whether to drop the zarr chunks encoded for each variable when a + Whether to drop the zarr chunks encoded for each variable when a dataset is loaded with specified chunk sizes (default: False) decode_cf : bool, optional Whether to decode these variables, assuming they were saved according @@ -434,12 +434,11 @@ def open_zarr(store, group=None, synchronizer=None, chunks=None, """ if auto_chunk and chunks is None: - chunks = 'auto' # maintain backwards compatibility + chunks = 'auto' # maintain backwards compatibility if not isinstance(chunks, (int, dict)): if chunks != 'auto' and chunks is not None: - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " + raise ValueError("chunks must be an int, dict, 'auto', or None. " "Instead found %s. " % chunks) if not decode_cf: @@ -471,7 +470,7 @@ def maybe_decode_store(store, lock=False): # return trivial case if not chunks: return ds - + # adapted from Dataset.Chunk() if isinstance(chunks, int): chunks = dict.fromkeys(ds.dims, chunks) @@ -488,7 +487,7 @@ def maybe_chunk(name, var, chunks): chunks = var.encoding.get('chunks') else: chunks = selkeys(chunks, var.dims) - + if (var.ndim > 0) and (chunks is not None): # does this cause any data to be read? token2 = tokenize(name, var._data) @@ -501,5 +500,5 @@ def maybe_chunk(name, var, chunks): return var variables = OrderedDict([(k, maybe_chunk(k, v, chunks)) - for k, v in ds.variables.items()]) + for k, v in ds.variables.items()]) return ds._replace_vars_and_dims(variables) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5f21027962f..5f555dde2d9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1412,9 +1412,8 @@ def test_manual_chunk(self): # All of these should return non-chunked arrays NO_CHUNKS = (None, 0, {}) for no_chunk in NO_CHUNKS: - with self.roundtrip( - original, open_kwargs={'chunks': no_chunk, - 'auto_chunk': False}) as actual: + open_kwargs = {'chunks': no_chunk, 'auto_chunk': False} + with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) @@ -1424,9 +1423,8 @@ def test_manual_chunk(self): # uniform arrays for i in range(2, 6): rechunked = original.chunk(chunks=i) - - with self.roundtrip( - original, open_kwargs={'chunks': i}) as actual: + open_kwargs = {'chunks': i} + with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) @@ -1435,20 +1433,17 @@ def test_manual_chunk(self): chunks = {'dim1': 2, 'dim2': 3, 'dim3': 5} rechunked = original.chunk(chunks=chunks) - - open_overwritten = {'chunks': chunks, - 'overwrite_encoded_chunks': True} - - with self.roundtrip( - original, open_kwargs=open_overwritten) as actual: + + open_kwargs = {'chunks': chunks, 'overwrite_encoded_chunks': True} + with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): assert v.chunks == rechunked[k].chunks - - with self.roundtrip(actual, open_kwargs={'chunks': 'auto'}) as auto: + + with self.roundtrip(actual) as auto: # encoding should have changed for k, v in actual.variables.items(): assert v.chunks == rechunked[k].chunks - + assert_identical(actual, auto) assert_identical(actual.load(), auto.load()) From 36f253fa2f67d8f9dbe0d141f65844b8369d5e64 Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Thu, 1 Nov 2018 06:18:37 +1100 Subject: [PATCH 04/30] removed whitespace --- xarray/backends/zarr.py | 8 ++++---- xarray/tests/test_backends.py | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 89f55400e82..7ed26e31151 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -439,7 +439,7 @@ def open_zarr(store, group=None, synchronizer=None, chunks=None, if not isinstance(chunks, (int, dict)): if chunks != 'auto' and chunks is not None: raise ValueError("chunks must be an int, dict, 'auto', or None. " - "Instead found %s. " % chunks) + "Instead found %s. " % chunks) if not decode_cf: mask_and_scale = False @@ -474,12 +474,12 @@ def maybe_decode_store(store, lock=False): # adapted from Dataset.Chunk() if isinstance(chunks, int): chunks = dict.fromkeys(ds.dims, chunks) - + def selkeys(dict_, keys): if dict_ is None: return None return dict((d, dict_[d]) for d in keys if d in dict_) - + def maybe_chunk(name, var, chunks): from dask.base import tokenize @@ -487,7 +487,7 @@ def maybe_chunk(name, var, chunks): chunks = var.encoding.get('chunks') else: chunks = selkeys(chunks, var.dims) - + if (var.ndim > 0) and (chunks is not None): # does this cause any data to be read? token2 = tokenize(name, var._data) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5f555dde2d9..01ef1caea38 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1405,10 +1405,10 @@ def test_auto_chunk(self): assert v._in_memory == (k in actual.dims) # chunk size should be the same as original assert v.chunks == original[k].chunks - + def test_manual_chunk(self): original = create_test_data().chunk({'dim1': 3, 'dim2': 4, 'dim3': 3}) - + # All of these should return non-chunked arrays NO_CHUNKS = (None, 0, {}) for no_chunk in NO_CHUNKS: @@ -1430,20 +1430,20 @@ def test_manual_chunk(self): assert v._in_memory == (k in actual.dims) # chunk size should be the same as rechunked assert v.chunks == rechunked[k].chunks - + chunks = {'dim1': 2, 'dim2': 3, 'dim3': 5} rechunked = original.chunk(chunks=chunks) - + open_kwargs = {'chunks': chunks, 'overwrite_encoded_chunks': True} with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): assert v.chunks == rechunked[k].chunks - + with self.roundtrip(actual) as auto: # encoding should have changed for k, v in actual.variables.items(): assert v.chunks == rechunked[k].chunks - + assert_identical(actual, auto) assert_identical(actual.load(), auto.load()) From ae4cf0ab19b3e563bde90a48b3e6ee615930d4a1 Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Wed, 7 Nov 2018 13:22:51 +1100 Subject: [PATCH 05/30] added deprecation warning --- xarray/backends/zarr.py | 34 +++++++++++++++++++++++----------- xarray/tests/test_backends.py | 30 +++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 7ed26e31151..302301248d3 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,4 +1,11 @@ +<<<<<<< HEAD from collections import OrderedDict +======= +from __future__ import absolute_import, division, print_function + +import warnings + +>>>>>>> added deprecation warning from distutils.version import LooseVersion import numpy as np @@ -352,7 +359,7 @@ def close(self): zarr.consolidate_metadata(self.ds.store) -def open_zarr(store, group=None, synchronizer=None, chunks=None, +def open_zarr(store, group=None, synchronizer=None, chunks='auto', decode_cf=True, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, drop_variables=None, consolidated=False, auto_chunk=True, @@ -379,13 +386,8 @@ def open_zarr(store, group=None, synchronizer=None, chunks=None, chunks : int or dict or {None, 'auto'}, optional Chunk sizes along each dimension, e.g., ``5`` or ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created - based on the variable's zarr chunks. If `chunks=None` and - `auto_chunk=False`, zarr array data will lazily convert to numpy - arrays upon access. - auto_chunk : bool, optional - Whether to automatically create dask chunks corresponding to each - variable's zarr chunks. If `chunks=None`, this overrides `chunks`. - Equivalent to `chunks='auto'.` (Default: True) + based on the variable's zarr chunks. If `chunks=None`, zarr array + data will lazily convert to numpy arrays upon access. overwrite_encoded_chunks: bool, optional Whether to drop the zarr chunks encoded for each variable when a dataset is loaded with specified chunk sizes (default: False) @@ -432,9 +434,19 @@ def open_zarr(store, group=None, synchronizer=None, chunks=None, ---------- http://zarr.readthedocs.io/ """ - - if auto_chunk and chunks is None: - chunks = 'auto' # maintain backwards compatibility + if 'auto_chunk' in kwargs: + auto_chunk = kwargs.pop('auto_chunk') + if auto_chunk == True: + chunks = 'auto' # maintain backwards compatibility + elif auto_chunk == False: + chunks = None + + warnings.warn("auto_chunk is deprecated. Use chunks='auto' instead.", + FutureWarning, stacklevel=2) + + if kwargs: + raise TypeError("open_zarr() got unexpected keyword arguments " + + ",".join(kwargs.keys())) if not isinstance(chunks, (int, dict)): if chunks != 'auto' and chunks is not None: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 01ef1caea38..21e5518f6de 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1391,7 +1391,7 @@ def test_auto_chunk(self): original = create_test_data().chunk() with self.roundtrip( - original, open_kwargs={'auto_chunk': False}) as actual: + original, open_kwargs={'chunks': None}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) @@ -1399,7 +1399,7 @@ def test_auto_chunk(self): assert v.chunks is None with self.roundtrip( - original, open_kwargs={'auto_chunk': True}) as actual: + original, open_kwargs={'chunks': 'auto'}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) @@ -1412,7 +1412,7 @@ def test_manual_chunk(self): # All of these should return non-chunked arrays NO_CHUNKS = (None, 0, {}) for no_chunk in NO_CHUNKS: - open_kwargs = {'chunks': no_chunk, 'auto_chunk': False} + open_kwargs = {'chunks': no_chunk} with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): # only index variables should be in memory @@ -1446,13 +1446,33 @@ def test_manual_chunk(self): assert_identical(actual, auto) assert_identical(actual.load(), auto.load()) + + def test_deprecate_auto_chunk(self): + original = create_test_data().chunk() + with pytest.warns(FutureWarning): + with self.roundtrip( + original, open_kwargs={'auto_chunk': True}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # chunk size should be the same as original + assert v.chunks == original[k].chunks + + with pytest.warns(FutureWarning): + with self.roundtrip( + original, open_kwargs={'auto_chunk': False}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # there should be no chunks + assert v.chunks is None + def test_write_uneven_dask_chunks(self): # regression for GH#2225 original = create_test_data().chunk({'dim1': 3, 'dim2': 4, 'dim3': 3}) - with self.roundtrip( - original, open_kwargs={'auto_chunk': True}) as actual: + original, open_kwargs={'chunks': 'auto'}) as actual: for k, v in actual.data_vars.items(): print(k) assert v.chunks == actual[k].chunks From cccfd046dd1b1e2bd1965b0c2a271ec4adebf86e Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Wed, 7 Nov 2018 13:34:44 +1100 Subject: [PATCH 06/30] fixed pep8 issues --- xarray/backends/zarr.py | 4 ++-- xarray/tests/test_backends.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 302301248d3..14119660ad0 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -436,9 +436,9 @@ def open_zarr(store, group=None, synchronizer=None, chunks='auto', """ if 'auto_chunk' in kwargs: auto_chunk = kwargs.pop('auto_chunk') - if auto_chunk == True: + if auto_chunk: chunks = 'auto' # maintain backwards compatibility - elif auto_chunk == False: + else: chunks = None warnings.warn("auto_chunk is deprecated. Use chunks='auto' instead.", diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 21e5518f6de..3dac12b5727 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1446,27 +1446,27 @@ def test_manual_chunk(self): assert_identical(actual, auto) assert_identical(actual.load(), auto.load()) - + def test_deprecate_auto_chunk(self): original = create_test_data().chunk() with pytest.warns(FutureWarning): with self.roundtrip( - original, open_kwargs={'auto_chunk': True}) as actual: + original, open_kwargs={'auto_chunk': True}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) # chunk size should be the same as original assert v.chunks == original[k].chunks - + with pytest.warns(FutureWarning): with self.roundtrip( - original, open_kwargs={'auto_chunk': False}) as actual: + original, open_kwargs={'auto_chunk': False}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) # there should be no chunks assert v.chunks is None - + def test_write_uneven_dask_chunks(self): # regression for GH#2225 From da45d77d06897d71df61ed39d5c1d9ebe6256dbf Mon Sep 17 00:00:00 2001 From: Lily Date: Wed, 30 Jan 2019 11:06:35 +1100 Subject: [PATCH 07/30] added warning for bad chunks --- xarray/backends/zarr.py | 47 ++++++++++++++++++++++++++--------- xarray/tests/test_backends.py | 24 ++++++++++++++++++ 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 14119660ad0..9f10c6a3bb7 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -383,11 +383,12 @@ def open_zarr(store, group=None, synchronizer=None, chunks='auto', Array synchronizer provided to zarr group : str, obtional Group path. (a.k.a. `path` in zarr terminology.) - chunks : int or dict or {None, 'auto'}, optional + chunks : int or dict or tuple or {None, 'auto'}, optional Chunk sizes along each dimension, e.g., ``5`` or ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created based on the variable's zarr chunks. If `chunks=None`, zarr array - data will lazily convert to numpy arrays upon access. + data will lazily convert to numpy arrays upon access. This accepts + all the chunk specifications as Dask does. overwrite_encoded_chunks: bool, optional Whether to drop the zarr chunks encoded for each variable when a dataset is loaded with specified chunk sizes (default: False) @@ -486,25 +487,47 @@ def maybe_decode_store(store, lock=False): # adapted from Dataset.Chunk() if isinstance(chunks, int): chunks = dict.fromkeys(ds.dims, chunks) + + if isinstance(chunks, tuple) and len(chunks) == len(ds.dims): + chunks = dict(zip(ds.dims, chunks)) + + def get_chunk(name, var, chunks): + chunk_spec = dict(zip(var.dims, var.encoding.get('chunks'))) + + # Coordinate labels aren't chunked + if var.ndim == 1 and var.dims[0] == name: + return chunk_spec + + if chunks == 'auto': + return chunk_spec + + for dim in var.dims: + if dim in chunks: + spec = chunks[dim] + if isinstance(spec, int): + spec = (spec,) + if isinstance(spec, (tuple, list)) and chunk_spec[dim]: + if any(s % chunk_spec[dim] for s in spec): + print('ok any', spec, chunk_spec[dim], dim) + warnings.warn("Specified Dask chunks %r would " + "separate Zarr chunk shape %r for dimension %r. " + "This significantly degrades performance. " + "Consider rechunking after loading." + % (chunks[dim], chunk_spec[dim], dim)) + chunk_spec[dim] = chunks[dim] + return chunk_spec - def selkeys(dict_, keys): - if dict_ is None: - return None - return dict((d, dict_[d]) for d in keys if d in dict_) def maybe_chunk(name, var, chunks): from dask.base import tokenize - if chunks == 'auto': - chunks = var.encoding.get('chunks') - else: - chunks = selkeys(chunks, var.dims) + chunk_spec = get_chunk(name, var, chunks) - if (var.ndim > 0) and (chunks is not None): + if (var.ndim > 0) and (chunk_spec is not None): # does this cause any data to be read? token2 = tokenize(name, var._data) name2 = 'zarr-%s' % token2 - var = var.chunk(chunks, name=name2, lock=None) + var = var.chunk(chunk_spec, name=name2, lock=None) if overwrite_encoded_chunks and var.chunks is not None: var.encoding['chunks'] = tuple(x[0] for x in var.chunks) return var diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3dac12b5727..1b77da32528 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1446,6 +1446,30 @@ def test_manual_chunk(self): assert_identical(actual, auto) assert_identical(actual.load(), auto.load()) + + def test_warning_on_bad_chunks(self): + original = create_test_data().chunk({'dim1': 4, 'dim2': 3, 'dim3': 5}) + + bad_chunks = (2, {'dim2':(3, 3, 2, 1)}) + for chunks in bad_chunks: + kwargs = {'chunks': chunks} + with pytest.warns(UserWarning): + with self.roundtrip(original, open_kwargs=kwargs) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + + good_chunks = ({'dim2': 3}, {'dim3': 10}) + for chunks in good_chunks: + kwargs = {'chunks': chunks} + with pytest.warns(None) as record: + with self.roundtrip(original, open_kwargs=kwargs) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + assert len(record) == 0 + + def test_deprecate_auto_chunk(self): original = create_test_data().chunk() From 7618c08e42e95da6682254053c8b38d4ebe43355 Mon Sep 17 00:00:00 2001 From: Lily Date: Wed, 30 Jan 2019 11:34:50 +1100 Subject: [PATCH 08/30] fixed lingering rebase conflicts --- doc/whats-new.rst | 11 +++++------ xarray/backends/zarr.py | 26 +++++++++----------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 86d30ce42ad..5020a975d67 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -121,6 +121,11 @@ Other enhancements By `Keisuke Fujii `_. - Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`). By `Kevin Squire `_. +- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=`` + parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for + backwards compatibility. The ``overwrite_encoded_chunks`` parameter is + added to remove the original zarr chunk encoding. + By `Lily Wang `_. Bug fixes ~~~~~~~~~ @@ -428,12 +433,6 @@ Bug fixes encoding process if a reference date is used that is so distant that the dates must be encoded using cftime rather than NumPy (:issue:`2272`). By `Spencer Clark `_. - -- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=`` - parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for - backwards compatibility. The ``overwrite_encoded_chunks`` parameter is - added to remove the original zarr chunk encoding. - By `Lily Wang `_. - Chunked datasets can now roundtrip to Zarr storage continually with `to_zarr` and ``open_zarr`` (:issue:`2300`). diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 9f10c6a3bb7..87507ece201 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,11 +1,5 @@ -<<<<<<< HEAD -from collections import OrderedDict -======= -from __future__ import absolute_import, division, print_function - import warnings - ->>>>>>> added deprecation warning +from collections import OrderedDict from distutils.version import LooseVersion import numpy as np @@ -362,8 +356,8 @@ def close(self): def open_zarr(store, group=None, synchronizer=None, chunks='auto', decode_cf=True, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, consolidated=False, auto_chunk=True, - overwrite_encoded_chunks=False): + drop_variables=None, consolidated=False, + overwrite_encoded_chunks=False, **kwargs): """Load and decode a dataset from a Zarr store. .. note:: Experimental @@ -387,7 +381,7 @@ def open_zarr(store, group=None, synchronizer=None, chunks='auto', Chunk sizes along each dimension, e.g., ``5`` or ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created based on the variable's zarr chunks. If `chunks=None`, zarr array - data will lazily convert to numpy arrays upon access. This accepts + data will lazily convert to numpy arrays upon access. This accepts all the chunk specifications as Dask does. overwrite_encoded_chunks: bool, optional Whether to drop the zarr chunks encoded for each variable when a @@ -487,7 +481,7 @@ def maybe_decode_store(store, lock=False): # adapted from Dataset.Chunk() if isinstance(chunks, int): chunks = dict.fromkeys(ds.dims, chunks) - + if isinstance(chunks, tuple) and len(chunks) == len(ds.dims): chunks = dict(zip(ds.dims, chunks)) @@ -508,16 +502,14 @@ def get_chunk(name, var, chunks): spec = (spec,) if isinstance(spec, (tuple, list)) and chunk_spec[dim]: if any(s % chunk_spec[dim] for s in spec): - print('ok any', spec, chunk_spec[dim], dim) warnings.warn("Specified Dask chunks %r would " - "separate Zarr chunk shape %r for dimension %r. " - "This significantly degrades performance. " - "Consider rechunking after loading." - % (chunks[dim], chunk_spec[dim], dim)) + "separate Zarr chunk shape %r for dimension %r. " + "This significantly degrades performance. " + "Consider rechunking after loading." + % (chunks[dim], chunk_spec[dim], dim)) chunk_spec[dim] = chunks[dim] return chunk_spec - def maybe_chunk(name, var, chunks): from dask.base import tokenize From 8571131e78197d1fc0101c2be849527f9a883607 Mon Sep 17 00:00:00 2001 From: Lily Date: Wed, 30 Jan 2019 11:38:14 +1100 Subject: [PATCH 09/30] fixed pep8 issues --- xarray/backends/zarr.py | 9 +++++---- xarray/tests/test_backends.py | 7 ++----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 87507ece201..d0696f20499 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -503,10 +503,11 @@ def get_chunk(name, var, chunks): if isinstance(spec, (tuple, list)) and chunk_spec[dim]: if any(s % chunk_spec[dim] for s in spec): warnings.warn("Specified Dask chunks %r would " - "separate Zarr chunk shape %r for dimension %r. " - "This significantly degrades performance. " - "Consider rechunking after loading." - % (chunks[dim], chunk_spec[dim], dim)) + "separate Zarr chunk shape %r for " + "dimension %r. This significantly " + "degrades performance. Consider " + "rechunking after loading instead." + % (chunks[dim], chunk_spec[dim], dim)) chunk_spec[dim] = chunks[dim] return chunk_spec diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1b77da32528..5efcdf9cd98 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1446,11 +1446,11 @@ def test_manual_chunk(self): assert_identical(actual, auto) assert_identical(actual.load(), auto.load()) - + def test_warning_on_bad_chunks(self): original = create_test_data().chunk({'dim1': 4, 'dim2': 3, 'dim3': 5}) - bad_chunks = (2, {'dim2':(3, 3, 2, 1)}) + bad_chunks = (2, {'dim2': (3, 3, 2, 1)}) for chunks in bad_chunks: kwargs = {'chunks': chunks} with pytest.warns(UserWarning): @@ -1469,8 +1469,6 @@ def test_warning_on_bad_chunks(self): assert v._in_memory == (k in actual.dims) assert len(record) == 0 - - def test_deprecate_auto_chunk(self): original = create_test_data().chunk() with pytest.warns(FutureWarning): @@ -1491,7 +1489,6 @@ def test_deprecate_auto_chunk(self): # there should be no chunks assert v.chunks is None - def test_write_uneven_dask_chunks(self): # regression for GH#2225 original = create_test_data().chunk({'dim1': 3, 'dim2': 4, 'dim3': 3}) From a70205a4af1e51fa5534db72c18e33321ce7c9b8 Mon Sep 17 00:00:00 2001 From: Lily Date: Thu, 4 Apr 2019 12:39:26 +1100 Subject: [PATCH 10/30] added stacklevel --- xarray/backends/zarr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index d0696f20499..e20140ee248 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -212,7 +212,7 @@ def encode_zarr_variable(var, needs_copy=True, name=None): # zarr allows unicode, but not variable-length strings, so it's both # simpler and more compact to always encode as UTF-8 explicitly. # TODO: allow toggling this explicitly via dtype in encoding. - coder = coding.strings.EncodedStringCoder(allows_unicode=False) + coder = coding.strings.EncodedStringCoder(allows_unicode=True) var = coder.encode(var, name=name) var = coding.strings.ensure_fixed_length_bytes(var) @@ -507,7 +507,8 @@ def get_chunk(name, var, chunks): "dimension %r. This significantly " "degrades performance. Consider " "rechunking after loading instead." - % (chunks[dim], chunk_spec[dim], dim)) + % (chunks[dim], chunk_spec[dim], dim), + stacklevel=2) chunk_spec[dim] = chunks[dim] return chunk_spec From 17fa557d9ada9e60971ce0a7370fd969aa788946 Mon Sep 17 00:00:00 2001 From: Lily Date: Thu, 4 Apr 2019 12:44:45 +1100 Subject: [PATCH 11/30] fixed pep8 issues --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5efcdf9cd98..bf40e529931 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1437,7 +1437,7 @@ def test_manual_chunk(self): open_kwargs = {'chunks': chunks, 'overwrite_encoded_chunks': True} with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): - assert v.chunks == rechunked[k].chunks + assert v.chunks == rechunked[k].chunks with self.roundtrip(actual) as auto: # encoding should have changed From 31619d711b952a30c8cfe56916c8720359835827 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 4 Apr 2019 14:58:23 -0700 Subject: [PATCH 12/30] Various fixes for explicit Dataset.indexes (#2858) * Various fixes for explicit Dataset.indexes Fixes GH2856 I've added internal consistency checks to the uses of ``assert_equal`` in our test suite, so this shouldn't happen again. * Fix indexes in Dataset.interp --- doc/whats-new.rst | 4 +- xarray/core/alignment.py | 63 +++++++++++------------ xarray/core/dataarray.py | 3 -- xarray/core/dataset.py | 48 +++++++++++------- xarray/core/indexes.py | 7 +-- xarray/testing.py | 43 ++++++++++++++-- xarray/tests/__init__.py | 25 +++++++++- xarray/tests/test_combine.py | 9 +++- xarray/tests/test_dataset.py | 97 ++++++++++++++++++++++++------------ xarray/tests/test_interp.py | 2 +- 10 files changed, 202 insertions(+), 99 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9c88445b5ba..6fc7c25ac91 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -31,7 +31,9 @@ Bug fixes - Dataset.copy(deep=True) now creates a deep copy of the attrs (:issue:`2835`). By `Andras Gefferth `_. -- ``swap_dims`` would create incorrect ``indexes`` (:issue:`2842`). +- Fix incorrect ``indexes`` resulting from various ``Dataset`` operations + (e.g., ``swap_dims``, ``isel``, ``reindex``, ``[]``) (:issue:`2842`, + :issue:`2856`). By `Stephan Hoyer `_. .. _whats-new.0.12.0: diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index af08eef268f..642be735e9b 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -315,36 +315,51 @@ def reindex_variables( """ from .dataarray import DataArray + # create variables for the new dataset + reindexed = OrderedDict() # type: OrderedDict[Any, Variable] + # build up indexers for assignment along each dimension int_indexers = {} - targets = OrderedDict() # type: OrderedDict[Any, pd.Index] + new_indexes = OrderedDict(indexes) masked_dims = set() unchanged_dims = set() - # size of reindexed dimensions - new_sizes = {} + for dim, indexer in indexers.items(): + if isinstance(indexer, DataArray) and indexer.dims != (dim,): + warnings.warn( + "Indexer has dimensions {0:s} that are different " + "from that to be indexed along {1:s}. " + "This will behave differently in the future.".format( + str(indexer.dims), dim), + FutureWarning, stacklevel=3) + + target = new_indexes[dim] = utils.safe_cast_to_index(indexers[dim]) + + if dim in indexes: + index = indexes[dim] - for name, index in indexes.items(): - if name in indexers: if not index.is_unique: raise ValueError( 'cannot reindex or align along dimension %r because the ' - 'index has duplicate values' % name) - - target = utils.safe_cast_to_index(indexers[name]) - new_sizes[name] = len(target) + 'index has duplicate values' % dim) int_indexer = get_indexer_nd(index, target, method, tolerance) # We uses negative values from get_indexer_nd to signify # values that are missing in the index. if (int_indexer < 0).any(): - masked_dims.add(name) + masked_dims.add(dim) elif np.array_equal(int_indexer, np.arange(len(index))): - unchanged_dims.add(name) + unchanged_dims.add(dim) - int_indexers[name] = int_indexer - targets[name] = target + int_indexers[dim] = int_indexer + + if dim in variables: + var = variables[dim] + args = (var.attrs, var.encoding) # type: tuple + else: + args = () + reindexed[dim] = IndexVariable((dim,), target, *args) for dim in sizes: if dim not in indexes and dim in indexers: @@ -356,25 +371,6 @@ def reindex_variables( 'index because its size %r is different from the size of ' 'the new index %r' % (dim, existing_size, new_size)) - # create variables for the new dataset - reindexed = OrderedDict() # type: OrderedDict[Any, Variable] - - for dim, indexer in indexers.items(): - if isinstance(indexer, DataArray) and indexer.dims != (dim,): - warnings.warn( - "Indexer has dimensions {0:s} that are different " - "from that to be indexed along {1:s}. " - "This will behave differently in the future.".format( - str(indexer.dims), dim), - FutureWarning, stacklevel=3) - - if dim in variables: - var = variables[dim] - args = (var.attrs, var.encoding) # type: tuple - else: - args = () - reindexed[dim] = IndexVariable((dim,), indexers[dim], *args) - for name, var in variables.items(): if name not in indexers: key = tuple(slice(None) @@ -395,9 +391,6 @@ def reindex_variables( reindexed[name] = new_var - new_indexes = OrderedDict(indexes) - new_indexes.update(targets) - return reindexed, new_indexes diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index c24703f5384..a9e55159f57 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -231,9 +231,6 @@ def __init__(self, data, coords=None, dims=None, name=None, coords, dims = _infer_coords_and_dims(data.shape, coords, dims) variable = Variable(dims, data, attrs, encoding, fastpath=True) - # uncomment for a useful consistency check: - # assert all(isinstance(v, Variable) for v in coords.values()) - # These fully describe a DataArray self._variable = variable self._coords = coords diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e3b2e3c3d2c..cf6631fa5ba 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -938,6 +938,7 @@ def _copy_listed(self: T, names) -> T: """ variables = OrderedDict() # type: OrderedDict[Any, Variable] coord_names = set() + indexes = OrderedDict() # type: OrderedDict[Any, pd.Index] for name in names: try: @@ -948,6 +949,8 @@ def _copy_listed(self: T, names) -> T: variables[var_name] = var if ref_name in self._coord_names or ref_name in self.dims: coord_names.add(var_name) + if (var_name,) == var.dims: + indexes[var_name] = var.to_index() needed_dims = set() # type: set for v in variables.values(): @@ -959,12 +962,8 @@ def _copy_listed(self: T, names) -> T: if set(self.variables[k].dims) <= needed_dims: variables[k] = self._variables[k] coord_names.add(k) - - if self._indexes is None: - indexes = None - else: - indexes = OrderedDict((k, v) for k, v in self._indexes.items() - if k in coord_names) + if k in self.indexes: + indexes[k] = self.indexes[k] return self._replace(variables, coord_names, dims, indexes=indexes) @@ -1503,9 +1502,13 @@ def _validate_indexers( raise ValueError("dimensions %r do not exist" % invalid) # all indexers should be int, slice, np.ndarrays, or Variable - indexers_list = [] + indexers_list = [] # type: List[Tuple[Any, Union[slice, Variable]]] for k, v in indexers.items(): - if isinstance(v, (slice, Variable)): + if isinstance(v, slice): + indexers_list.append((k, v)) + continue + + if isinstance(v, Variable): pass elif isinstance(v, DataArray): v = v.variable @@ -1524,14 +1527,19 @@ def _validate_indexers( v = _parse_array_of_cftime_strings(v, index.date_type) if v.ndim == 0: - v = as_variable(v) + v = Variable((), v) elif v.ndim == 1: - v = as_variable((k, v)) + v = IndexVariable((k,), v) else: raise IndexError( "Unlabeled multi-dimensional array cannot be " "used for indexing: {}".format(k)) + + if v.ndim == 1: + v = v.to_index_variable() + indexers_list.append((k, v)) + return indexers_list def _get_indexers_coords_and_indexes(self, indexers): @@ -1631,7 +1639,7 @@ def isel(self, indexers=None, drop=False, **indexers_kwargs): if name in self.indexes: new_var, new_index = isel_variable_and_index( - var, self.indexes[name], var_indexers) + name, var, self.indexes[name], var_indexers) if new_index is not None: indexes[name] = new_index else: @@ -2117,15 +2125,20 @@ def _validate_interp_indexer(x, new_x): indexes = OrderedDict( (k, v) for k, v in obj.indexes.items() if k not in indexers) selected = self._replace_with_new_dims( - variables, coord_names, indexes=indexes) + variables.copy(), coord_names, indexes=indexes) # attach indexer as coordinate variables.update(indexers) + indexes.update( + (k, v.to_index()) for k, v in indexers.items() if v.dims == (k,) + ) + # Extract coordinates from indexers coord_vars, new_indexes = ( selected._get_indexers_coords_and_indexes(coords)) variables.update(coord_vars) indexes.update(new_indexes) + coord_names = (set(variables) .intersection(obj._coord_names) .union(coord_vars)) @@ -2401,6 +2414,7 @@ def expand_dims(self, dim=None, axis=None, **dim_kwargs): ' variable name.'.format(dim=d)) variables = OrderedDict() + coord_names = self._coord_names.copy() # If dim is a dict, then ensure that the values are either integers # or iterables. for k, v in dim.items(): @@ -2410,7 +2424,7 @@ def expand_dims(self, dim=None, axis=None, **dim_kwargs): # value within the dim dict to the length of the iterable # for later use. variables[k] = xr.IndexVariable((k,), v) - self._coord_names.add(k) + coord_names.add(k) dim[k] = variables[k].size elif isinstance(v, int): pass # Do nothing if the dimensions value is just an int @@ -2420,7 +2434,7 @@ def expand_dims(self, dim=None, axis=None, **dim_kwargs): for k, v in self._variables.items(): if k not in dim: - if k in self._coord_names: # Do not change coordinates + if k in coord_names: # Do not change coordinates variables[k] = v else: result_ndim = len(v.dims) + len(axis) @@ -2452,10 +2466,10 @@ def expand_dims(self, dim=None, axis=None, **dim_kwargs): variables[k] = v.set_dims(k) new_dims = self._dims.copy() - for d in dim: - new_dims[d] = 1 + new_dims.update(dim) - return self._replace(variables, dims=new_dims) + return self._replace_vars_and_dims( + variables, dims=new_dims, coord_names=coord_names) def set_index(self, indexes=None, append=False, inplace=None, **indexes_kwargs): diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 6d8b553036a..eccb72b6a58 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -1,6 +1,6 @@ import collections.abc from collections import OrderedDict -from typing import Any, Iterable, Mapping, Optional, Tuple, Union +from typing import Any, Hashable, Iterable, Mapping, Optional, Tuple, Union import pandas as pd @@ -59,6 +59,7 @@ def default_indexes( def isel_variable_and_index( + name: Hashable, variable: Variable, index: pd.Index, indexers: Mapping[Any, Union[slice, Variable]], @@ -75,8 +76,8 @@ def isel_variable_and_index( new_variable = variable.isel(indexers) - if new_variable.ndim != 1: - # can't preserve a index if result is not 0D + if new_variable.dims != (name,): + # can't preserve a index if result has new dimensions return new_variable, None # we need to compute the new index diff --git a/xarray/testing.py b/xarray/testing.py index 794c0614925..eb8a0e8603d 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -1,8 +1,12 @@ """Testing functions exposed to the user API""" +from collections import OrderedDict + import numpy as np +import pandas as pd from xarray.core import duck_array_ops from xarray.core import formatting +from xarray.core.indexes import default_indexes def _decode_string_data(data): @@ -143,8 +147,37 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True): .format(type(a))) -def assert_combined_tile_ids_equal(dict1, dict2): - assert len(dict1) == len(dict2) - for k, v in dict1.items(): - assert k in dict2.keys() - assert_equal(dict1[k], dict2[k]) +def _assert_indexes_invariants_checks(indexes, possible_coord_variables, dims): + import xarray as xr + + assert isinstance(indexes, OrderedDict), indexes + assert all(isinstance(v, pd.Index) for v in indexes.values()), \ + {k: type(v) for k, v in indexes.items()} + + index_vars = {k for k, v in possible_coord_variables.items() + if isinstance(v, xr.IndexVariable)} + assert indexes.keys() <= index_vars, (set(indexes), index_vars) + + # Note: when we support non-default indexes, these checks should be opt-in + # only! + defaults = default_indexes(possible_coord_variables, dims) + assert indexes.keys() == defaults.keys(), \ + (set(indexes), set(defaults)) + assert all(v.equals(defaults[k]) for k, v in indexes.items()), \ + (indexes, defaults) + + +def _assert_indexes_invariants(a): + """Separate helper function for checking indexes invariants only.""" + import xarray as xr + + if isinstance(a, xr.DataArray): + if a._indexes is not None: + _assert_indexes_invariants_checks(a._indexes, a._coords, a.dims) + elif isinstance(a, xr.Dataset): + if a._indexes is not None: + _assert_indexes_invariants_checks( + a._indexes, a._variables, a._dims) + elif isinstance(a, xr.Variable): + # no indexes + pass diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 4ebcc29a61e..525360701fe 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -13,8 +13,7 @@ from xarray.core import utils from xarray.core.options import set_options from xarray.core.indexing import ExplicitlyIndexed -from xarray.testing import (assert_equal, assert_identical, # noqa: F401 - assert_allclose, assert_combined_tile_ids_equal) +import xarray.testing from xarray.plot.utils import import_seaborn try: @@ -180,3 +179,25 @@ def source_ndarray(array): if base is None: base = array return base + + +# Internal versions of xarray's test functions that validate additional +# invariants +# TODO: add more invariant checks. + +def assert_equal(a, b): + xarray.testing.assert_equal(a, b) + xarray.testing._assert_indexes_invariants(a) + xarray.testing._assert_indexes_invariants(b) + + +def assert_identical(a, b): + xarray.testing.assert_identical(a, b) + xarray.testing._assert_indexes_invariants(a) + xarray.testing._assert_indexes_invariants(b) + + +def assert_allclose(a, b, **kwargs): + xarray.testing.assert_allclose(a, b, **kwargs) + xarray.testing._assert_indexes_invariants(a) + xarray.testing._assert_indexes_invariants(b) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 0d03b6e0cdf..6d0f4626086 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -13,7 +13,7 @@ _infer_tile_ids_from_nested_list, _new_tile_id) from . import ( - InaccessibleArray, assert_array_equal, assert_combined_tile_ids_equal, + InaccessibleArray, assert_array_equal, assert_equal, assert_identical, raises_regex, requires_dask) from .test_dataset import create_test_data @@ -418,6 +418,13 @@ def test_auto_combine_no_concat(self): assert_identical(expected, actual) +def assert_combined_tile_ids_equal(dict1, dict2): + assert len(dict1) == len(dict2) + for k, v in dict1.items(): + assert k in dict2.keys() + assert_equal(dict1[k], dict2[k]) + + class TestTileIDsFromNestedList(object): def test_1d(self): ds = create_test_data diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index ab83d385ef4..3ace80f5eea 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2053,12 +2053,8 @@ def test_expand_dims_error(self): if python36_plus: with raises_regex(ValueError, 'both keyword and positional'): original.expand_dims(OrderedDict((("d", 4),)), e=4) - else: - # In python 3.5, using dim_kwargs should raise a ValueError. - with raises_regex(ValueError, "dim_kwargs isn't"): - original.expand_dims(OrderedDict((("d", 4),)), e=4) - def test_expand_dims(self): + def test_expand_dims_int(self): original = Dataset({'x': ('a', np.random.randn(3)), 'y': (['b', 'a'], np.random.randn(4, 3))}, coords={'a': np.linspace(0, 1, 3), @@ -2091,9 +2087,37 @@ def test_expand_dims(self): roundtripped = actual.squeeze('z') assert_identical(original, roundtripped) + def test_expand_dims_coords(self): + original = Dataset({'x': ('a', np.array([1, 2, 3]))}) + expected = Dataset( + {'x': (('b', 'a'), np.array([[1, 2, 3], [1, 2, 3]]))}, + coords={'b': [1, 2]}, + ) + actual = original.expand_dims(OrderedDict(b=[1, 2])) + assert_identical(expected, actual) + assert 'b' not in original._coord_names + + def test_expand_dims_existing_scalar_coord(self): + original = Dataset({'x': 1}, {'a': 2}) + expected = Dataset({'x': (('a',), [1])}, {'a': [2]}) + actual = original.expand_dims('a') + assert_identical(expected, actual) + + def test_isel_expand_dims_roundtrip(self): + original = Dataset({'x': (('a',), [1])}, {'a': [2]}) + actual = original.isel(a=0).expand_dims('a') + assert_identical(actual, original) + + def test_expand_dims_mixed_int_and_coords(self): # Test expanding one dimension to have size > 1 that doesn't have # coordinates, and also expanding another dimension to have size > 1 # that DOES have coordinates. + original = Dataset({'x': ('a', np.random.randn(3)), + 'y': (['b', 'a'], np.random.randn(4, 3))}, + coords={'a': np.linspace(0, 1, 3), + 'b': np.linspace(0, 1, 4), + 'c': np.linspace(0, 1, 5)}) + actual = original.expand_dims( OrderedDict((("d", 4), ("e", ["l", "m", "n"])))) @@ -2109,34 +2133,45 @@ def test_expand_dims(self): b=np.linspace(0, 1, 4), a=np.linspace(0, 1, 3)), dims=['d', 'e', 'b', 'a']).drop('d')}, - coords={'c': np.linspace(0, 1, 5)}, - attrs={'key': 'entry'}) + coords={'c': np.linspace(0, 1, 5)}) assert_identical(actual, expected) - # Test with kwargs instead of passing dict to dim arg. - - # TODO: only the code under the if-statement is needed when python 3.5 - # is no longer supported. - python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 - if python36_plus: - other_way = original.expand_dims(e=["l", "m", "n"]) - other_way_expected = Dataset( - {'x': xr.DataArray(original['x'].values * np.ones([3, 3]), - coords=dict(e=['l', 'm', 'n'], - a=np.linspace(0, 1, 3)), - dims=['e', 'a']), - 'y': xr.DataArray(original['y'].values * np.ones([3, 4, 3]), - coords=dict(e=['l', 'm', 'n'], - b=np.linspace(0, 1, 4), - a=np.linspace(0, 1, 3)), - dims=['e', 'b', 'a'])}, - coords={'c': np.linspace(0, 1, 5)}, - attrs={'key': 'entry'}) - assert_identical(other_way_expected, other_way) - else: - # In python 3.5, using dim_kwargs should raise a ValueError. - with raises_regex(ValueError, "dim_kwargs isn't"): - original.expand_dims(e=["l", "m", "n"]) + @pytest.mark.skipif( + sys.version_info[:2] > (3, 5), + reason="we only raise these errors for Python 3.5", + ) + def test_expand_dims_kwargs_python35(self): + original = Dataset({'x': ('a', np.random.randn(3))}) + with raises_regex(ValueError, "dim_kwargs isn't"): + original.expand_dims(e=["l", "m", "n"]) + with raises_regex(TypeError, "must be an OrderedDict"): + original.expand_dims({'e': ["l", "m", "n"]}) + + @pytest.mark.skipif( + sys.version_info[:2] < (3, 6), + reason='keyword arguments are only ordered on Python 3.6+', + ) + def test_expand_dims_kwargs_python36plus(self): + original = Dataset({'x': ('a', np.random.randn(3)), + 'y': (['b', 'a'], np.random.randn(4, 3))}, + coords={'a': np.linspace(0, 1, 3), + 'b': np.linspace(0, 1, 4), + 'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + other_way = original.expand_dims(e=["l", "m", "n"]) + other_way_expected = Dataset( + {'x': xr.DataArray(original['x'].values * np.ones([3, 3]), + coords=dict(e=['l', 'm', 'n'], + a=np.linspace(0, 1, 3)), + dims=['e', 'a']), + 'y': xr.DataArray(original['y'].values * np.ones([3, 4, 3]), + coords=dict(e=['l', 'm', 'n'], + b=np.linspace(0, 1, 4), + a=np.linspace(0, 1, 3)), + dims=['e', 'b', 'a'])}, + coords={'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + assert_identical(other_way_expected, other_way) def test_set_index(self): expected = create_test_multiindex() diff --git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py index 5596bfb3bfb..8347d54bd1e 100644 --- a/xarray/tests/test_interp.py +++ b/xarray/tests/test_interp.py @@ -291,7 +291,7 @@ def test_errors(use_dask): if use_dask: da = get_example_data(3) else: - da = get_example_data(1) + da = get_example_data(0) result = da.interp(x=[-1, 1, 3], kwargs={'fill_value': 0.0}) assert not np.isnan(result.values).any() From aa6abb592ac2464170459ca96409398ec8b4593a Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 4 Apr 2019 18:31:26 -0700 Subject: [PATCH 13/30] 0.12.1 release --- doc/whats-new.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6fc7c25ac91..76e51b21984 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -15,8 +15,8 @@ What's New .. _whats-new.0.12.1: -v0.12.1 (unreleased) --------------------- +v0.12.1 (4 April 2019) +---------------------- Enhancements ~~~~~~~~~~~~ @@ -25,7 +25,6 @@ Enhancements with size > 1. (:issue:`2710`) By `Martin Pletcher `_. - Bug fixes ~~~~~~~~~ From 23d54a890e6cfe1de420071b597c911164de4cb8 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 4 Apr 2019 18:34:37 -0700 Subject: [PATCH 14/30] revert to 0.12.2 dev --- doc/whats-new.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 76e51b21984..4c126196469 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,6 +13,17 @@ What's New import xarray as xr np.random.seed(123456) +.. _whats-new.0.12.2: + +v0.12.2 (unreleased) +-------------------- + +Enhancements +~~~~~~~~~~~~ + +Bug fixes +~~~~~~~~~ + .. _whats-new.0.12.1: v0.12.1 (4 April 2019) From e7ec0870b15114b0443c28bf7e32d42717808c98 Mon Sep 17 00:00:00 2001 From: Adam Leskis Date: Sun, 7 Apr 2019 20:55:07 +0100 Subject: [PATCH 15/30] update links to https (#2872) --- README.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/README.rst b/README.rst index 6dbf774549d..83382f87ed5 100644 --- a/README.rst +++ b/README.rst @@ -8,9 +8,9 @@ xarray: N-D labeled arrays and datasets .. image:: https://coveralls.io/repos/pydata/xarray/badge.svg :target: https://coveralls.io/r/pydata/xarray .. image:: https://readthedocs.org/projects/xray/badge/?version=latest - :target: http://xarray.pydata.org/ -.. image:: http://img.shields.io/badge/benchmarked%20by-asv-green.svg?style=flat - :target: http://pandas.pydata.org/speed/xarray/ + :target: https://xarray.pydata.org/ +.. image:: https://img.shields.io/badge/benchmarked%20by-asv-green.svg?style=flat + :target: https://pandas.pydata.org/speed/xarray/ .. image:: https://img.shields.io/pypi/v/xarray.svg :target: https://pypi.python.org/pypi/xarray/ @@ -30,10 +30,10 @@ It is particularly tailored to working with netCDF_ files, which were the source of xarray's data model, and integrates tightly with dask_ for parallel computing. -.. _NumPy: http://www.numpy.org -.. _pandas: http://pandas.pydata.org -.. _dask: http://dask.org -.. _netCDF: http://www.unidata.ucar.edu/software/netcdf +.. _NumPy: https://www.numpy.org +.. _pandas: https://pandas.pydata.org +.. _dask: https://dask.org +.. _netCDF: https://www.unidata.ucar.edu/software/netcdf Why xarray? ----------- @@ -66,12 +66,12 @@ powerful and concise interface. For example: Documentation ------------- -Learn more about xarray in its official documentation at http://xarray.pydata.org/ +Learn more about xarray in its official documentation at https://xarray.pydata.org/ Contributing ------------ -You can find information about contributing to xarray at our `Contributing page `_. +You can find information about contributing to xarray at our `Contributing page `_. Get in touch ------------ @@ -81,9 +81,9 @@ Get in touch - For less well defined questions or ideas, or to announce other projects of interest to xarray users, use the `mailing list`_. -.. _StackOverFlow: http://stackoverflow.com/questions/tagged/python-xarray +.. _StackOverFlow: https://stackoverflow.com/questions/tagged/python-xarray .. _mailing list: https://groups.google.com/forum/#!forum/xarray -.. _on GitHub: http://github.com/pydata/xarray +.. _on GitHub: https://github.com/pydata/xarray NumFOCUS -------- @@ -120,7 +120,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, From 3435b03de218f54a55eb72dff597bb47b0f407cb Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 7 Apr 2019 23:42:30 -0700 Subject: [PATCH 16/30] Fix mypy typing error in cftime_offsets.py (#2878) --- xarray/coding/cftime_offsets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index d724554b458..2ee38a20a4d 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -79,7 +79,7 @@ def get_date_type(calendar): class BaseCFTimeOffset(object): _freq = None # type: ClassVar[str] - _day_option = None + _day_option = None # type: ClassVar[str] def __init__(self, n=1): if not isinstance(n, int): From 2c10d1443bea09e5ef53e5a7e35195a195e193a7 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 9 Apr 2019 19:34:21 -0400 Subject: [PATCH 17/30] decreased pytest verbosity (#2881) --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index ea9ee7adcf4..212ddb77daa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -65,7 +65,7 @@ script: elif [[ "$CONDA_ENV" == "py36-hypothesis" ]]; then pytest properties ; else - py.test xarray --cov=xarray --cov-config ci/.coveragerc --cov-report term-missing --verbose $EXTRA_FLAGS; + py.test xarray --cov=xarray --cov-config ci/.coveragerc --cov-report term-missing $EXTRA_FLAGS; fi after_success: From f063f55f6ba31e8d871f9163570d94d256e72daa Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Thu, 1 Nov 2018 05:52:13 +1100 Subject: [PATCH 18/30] added manual chunks for open_zarr --- xarray/backends/zarr.py | 78 +++++++++++++++++++++++++---------- xarray/tests/test_backends.py | 46 +++++++++++++++++++++ 2 files changed, 103 insertions(+), 21 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index ee77e0833c4..a5ee9867f8e 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -352,10 +352,11 @@ def close(self): zarr.consolidate_metadata(self.ds.store) -def open_zarr(store, group=None, synchronizer=None, auto_chunk=True, +def open_zarr(store, group=None, synchronizer=None, chunks=None, decode_cf=True, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, consolidated=False): + drop_variables=None, consolidated=False, auto_chunk=True, + overwrite_encoded_chunks=False): """Load and decode a dataset from a Zarr store. .. note:: Experimental @@ -375,10 +376,19 @@ def open_zarr(store, group=None, synchronizer=None, auto_chunk=True, Array synchronizer provided to zarr group : str, obtional Group path. (a.k.a. `path` in zarr terminology.) + chunks : int or dict or {None, 'auto'}, optional + Chunk sizes along each dimension, e.g., ``5`` or + ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created + based on the variable's zarr chunks. If `chunks=None` and + `auto_chunk=False`, zarr array data will lazily convert to numpy + arrays upon access. auto_chunk : bool, optional Whether to automatically create dask chunks corresponding to each - variable's zarr chunks. If False, zarr array data will lazily convert - to numpy arrays upon access. + variable's zarr chunks. If `chunks=None`, this overrides `chunks`. + Equivalent to `chunks='auto'.` (Default: True) + overwrite_encoded_chunks: bool, optional + Whether to drop the zarr chunks encoded for each variable when a + dataset is loaded with specified chunk sizes (default: False) decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. @@ -423,6 +433,15 @@ def open_zarr(store, group=None, synchronizer=None, auto_chunk=True, http://zarr.readthedocs.io/ """ + if auto_chunk and chunks is None: + chunks = 'auto' # maintain backwards compatibility + + if not isinstance(chunks, (int, dict)): + if chunks != 'auto' and chunks is not None: + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. " % chunks) + if not decode_cf: mask_and_scale = False decode_times = False @@ -449,21 +468,38 @@ def maybe_decode_store(store, lock=False): # auto chunking needs to be here and not in ZarrStore because variable # chunks do not survive decode_cf - if auto_chunk: - # adapted from Dataset.Chunk() - def maybe_chunk(name, var): - from dask.base import tokenize - chunks = var.encoding.get('chunks') - if (var.ndim > 0) and (chunks is not None): - # does this cause any data to be read? - token2 = tokenize(name, var._data) - name2 = 'zarr-%s' % token2 - return var.chunk(chunks, name=name2, lock=None) - else: - return var - - variables = OrderedDict([(k, maybe_chunk(k, v)) - for k, v in ds.variables.items()]) - return ds._replace_vars_and_dims(variables) - else: + # return trivial case + if not chunks: return ds + + # adapted from Dataset.Chunk() + if isinstance(chunks, int): + chunks = dict.fromkeys(ds.dims, chunks) + + def selkeys(dict_, keys): + if dict_ is None: + return None + return dict((d, dict_[d]) for d in keys if d in dict_) + + def maybe_chunk(name, var, chunks): + from dask.base import tokenize + + if chunks == 'auto': + chunks = var.encoding.get('chunks') + else: + chunks = selkeys(chunks, var.dims) + + if (var.ndim > 0) and (chunks is not None): + # does this cause any data to be read? + token2 = tokenize(name, var._data) + name2 = 'zarr-%s' % token2 + var = var.chunk(chunks, name=name2, lock=None) + if overwrite_encoded_chunks and var.chunks is not None: + var.encoding['chunks'] = tuple(x[0] for x in var.chunks) + return var + else: + return var + + variables = OrderedDict([(k, maybe_chunk(k, v, chunks)) + for k, v in ds.variables.items()]) + return ds._replace_vars_and_dims(variables) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a20ba2df229..5f21027962f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1405,6 +1405,52 @@ def test_auto_chunk(self): assert v._in_memory == (k in actual.dims) # chunk size should be the same as original assert v.chunks == original[k].chunks + + def test_manual_chunk(self): + original = create_test_data().chunk({'dim1': 3, 'dim2': 4, 'dim3': 3}) + + # All of these should return non-chunked arrays + NO_CHUNKS = (None, 0, {}) + for no_chunk in NO_CHUNKS: + with self.roundtrip( + original, open_kwargs={'chunks': no_chunk, + 'auto_chunk': False}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # there should be no chunks + assert v.chunks is None + + # uniform arrays + for i in range(2, 6): + rechunked = original.chunk(chunks=i) + + with self.roundtrip( + original, open_kwargs={'chunks': i}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # chunk size should be the same as rechunked + assert v.chunks == rechunked[k].chunks + + chunks = {'dim1': 2, 'dim2': 3, 'dim3': 5} + rechunked = original.chunk(chunks=chunks) + + open_overwritten = {'chunks': chunks, + 'overwrite_encoded_chunks': True} + + with self.roundtrip( + original, open_kwargs=open_overwritten) as actual: + for k, v in actual.variables.items(): + assert v.chunks == rechunked[k].chunks + + with self.roundtrip(actual, open_kwargs={'chunks': 'auto'}) as auto: + # encoding should have changed + for k, v in actual.variables.items(): + assert v.chunks == rechunked[k].chunks + + assert_identical(actual, auto) + assert_identical(actual.load(), auto.load()) def test_write_uneven_dask_chunks(self): # regression for GH#2225 From c02a1c7f4dd6e686042d7622fe6dbdd40e0917ef Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Thu, 1 Nov 2018 05:53:03 +1100 Subject: [PATCH 19/30] updated whats-new --- doc/whats-new.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4c126196469..5d32fffc53a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -440,6 +440,12 @@ Bug fixes encoding process if a reference date is used that is so distant that the dates must be encoded using cftime rather than NumPy (:issue:`2272`). By `Spencer Clark `_. + +- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=`` + parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for + backwards compatibility. The ``overwrite_encoded_chunks`` parameter is + added to remove the original zarr chunk encoding. + By `Lily Wang `_. - Chunked datasets can now roundtrip to Zarr storage continually with `to_zarr` and ``open_zarr`` (:issue:`2300`). From c361f705a707bc7df3ad6558592e165a62d11479 Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Thu, 1 Nov 2018 06:14:23 +1100 Subject: [PATCH 20/30] fixed pep8 issues --- xarray/backends/zarr.py | 23 +++++++++++------------ xarray/tests/test_backends.py | 25 ++++++++++--------------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index a5ee9867f8e..89f55400e82 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -378,16 +378,16 @@ def open_zarr(store, group=None, synchronizer=None, chunks=None, Group path. (a.k.a. `path` in zarr terminology.) chunks : int or dict or {None, 'auto'}, optional Chunk sizes along each dimension, e.g., ``5`` or - ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created - based on the variable's zarr chunks. If `chunks=None` and - `auto_chunk=False`, zarr array data will lazily convert to numpy - arrays upon access. + ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created + based on the variable's zarr chunks. If `chunks=None` and + `auto_chunk=False`, zarr array data will lazily convert to numpy + arrays upon access. auto_chunk : bool, optional Whether to automatically create dask chunks corresponding to each - variable's zarr chunks. If `chunks=None`, this overrides `chunks`. + variable's zarr chunks. If `chunks=None`, this overrides `chunks`. Equivalent to `chunks='auto'.` (Default: True) overwrite_encoded_chunks: bool, optional - Whether to drop the zarr chunks encoded for each variable when a + Whether to drop the zarr chunks encoded for each variable when a dataset is loaded with specified chunk sizes (default: False) decode_cf : bool, optional Whether to decode these variables, assuming they were saved according @@ -434,12 +434,11 @@ def open_zarr(store, group=None, synchronizer=None, chunks=None, """ if auto_chunk and chunks is None: - chunks = 'auto' # maintain backwards compatibility + chunks = 'auto' # maintain backwards compatibility if not isinstance(chunks, (int, dict)): if chunks != 'auto' and chunks is not None: - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " + raise ValueError("chunks must be an int, dict, 'auto', or None. " "Instead found %s. " % chunks) if not decode_cf: @@ -471,7 +470,7 @@ def maybe_decode_store(store, lock=False): # return trivial case if not chunks: return ds - + # adapted from Dataset.Chunk() if isinstance(chunks, int): chunks = dict.fromkeys(ds.dims, chunks) @@ -488,7 +487,7 @@ def maybe_chunk(name, var, chunks): chunks = var.encoding.get('chunks') else: chunks = selkeys(chunks, var.dims) - + if (var.ndim > 0) and (chunks is not None): # does this cause any data to be read? token2 = tokenize(name, var._data) @@ -501,5 +500,5 @@ def maybe_chunk(name, var, chunks): return var variables = OrderedDict([(k, maybe_chunk(k, v, chunks)) - for k, v in ds.variables.items()]) + for k, v in ds.variables.items()]) return ds._replace_vars_and_dims(variables) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5f21027962f..5f555dde2d9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1412,9 +1412,8 @@ def test_manual_chunk(self): # All of these should return non-chunked arrays NO_CHUNKS = (None, 0, {}) for no_chunk in NO_CHUNKS: - with self.roundtrip( - original, open_kwargs={'chunks': no_chunk, - 'auto_chunk': False}) as actual: + open_kwargs = {'chunks': no_chunk, 'auto_chunk': False} + with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) @@ -1424,9 +1423,8 @@ def test_manual_chunk(self): # uniform arrays for i in range(2, 6): rechunked = original.chunk(chunks=i) - - with self.roundtrip( - original, open_kwargs={'chunks': i}) as actual: + open_kwargs = {'chunks': i} + with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) @@ -1435,20 +1433,17 @@ def test_manual_chunk(self): chunks = {'dim1': 2, 'dim2': 3, 'dim3': 5} rechunked = original.chunk(chunks=chunks) - - open_overwritten = {'chunks': chunks, - 'overwrite_encoded_chunks': True} - - with self.roundtrip( - original, open_kwargs=open_overwritten) as actual: + + open_kwargs = {'chunks': chunks, 'overwrite_encoded_chunks': True} + with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): assert v.chunks == rechunked[k].chunks - - with self.roundtrip(actual, open_kwargs={'chunks': 'auto'}) as auto: + + with self.roundtrip(actual) as auto: # encoding should have changed for k, v in actual.variables.items(): assert v.chunks == rechunked[k].chunks - + assert_identical(actual, auto) assert_identical(actual.load(), auto.load()) From 447af8c0e7a65c3ecc62925aebb077006e93c643 Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Thu, 1 Nov 2018 06:18:37 +1100 Subject: [PATCH 21/30] removed whitespace --- xarray/backends/zarr.py | 8 ++++---- xarray/tests/test_backends.py | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 89f55400e82..7ed26e31151 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -439,7 +439,7 @@ def open_zarr(store, group=None, synchronizer=None, chunks=None, if not isinstance(chunks, (int, dict)): if chunks != 'auto' and chunks is not None: raise ValueError("chunks must be an int, dict, 'auto', or None. " - "Instead found %s. " % chunks) + "Instead found %s. " % chunks) if not decode_cf: mask_and_scale = False @@ -474,12 +474,12 @@ def maybe_decode_store(store, lock=False): # adapted from Dataset.Chunk() if isinstance(chunks, int): chunks = dict.fromkeys(ds.dims, chunks) - + def selkeys(dict_, keys): if dict_ is None: return None return dict((d, dict_[d]) for d in keys if d in dict_) - + def maybe_chunk(name, var, chunks): from dask.base import tokenize @@ -487,7 +487,7 @@ def maybe_chunk(name, var, chunks): chunks = var.encoding.get('chunks') else: chunks = selkeys(chunks, var.dims) - + if (var.ndim > 0) and (chunks is not None): # does this cause any data to be read? token2 = tokenize(name, var._data) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5f555dde2d9..01ef1caea38 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1405,10 +1405,10 @@ def test_auto_chunk(self): assert v._in_memory == (k in actual.dims) # chunk size should be the same as original assert v.chunks == original[k].chunks - + def test_manual_chunk(self): original = create_test_data().chunk({'dim1': 3, 'dim2': 4, 'dim3': 3}) - + # All of these should return non-chunked arrays NO_CHUNKS = (None, 0, {}) for no_chunk in NO_CHUNKS: @@ -1430,20 +1430,20 @@ def test_manual_chunk(self): assert v._in_memory == (k in actual.dims) # chunk size should be the same as rechunked assert v.chunks == rechunked[k].chunks - + chunks = {'dim1': 2, 'dim2': 3, 'dim3': 5} rechunked = original.chunk(chunks=chunks) - + open_kwargs = {'chunks': chunks, 'overwrite_encoded_chunks': True} with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): assert v.chunks == rechunked[k].chunks - + with self.roundtrip(actual) as auto: # encoding should have changed for k, v in actual.variables.items(): assert v.chunks == rechunked[k].chunks - + assert_identical(actual, auto) assert_identical(actual.load(), auto.load()) From cdd23d4e37bfeaa1d3f18331539d9ddae8696530 Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Wed, 7 Nov 2018 13:22:51 +1100 Subject: [PATCH 22/30] added deprecation warning --- xarray/backends/zarr.py | 34 +++++++++++++++++++++++----------- xarray/tests/test_backends.py | 30 +++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 7ed26e31151..302301248d3 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,4 +1,11 @@ +<<<<<<< HEAD from collections import OrderedDict +======= +from __future__ import absolute_import, division, print_function + +import warnings + +>>>>>>> added deprecation warning from distutils.version import LooseVersion import numpy as np @@ -352,7 +359,7 @@ def close(self): zarr.consolidate_metadata(self.ds.store) -def open_zarr(store, group=None, synchronizer=None, chunks=None, +def open_zarr(store, group=None, synchronizer=None, chunks='auto', decode_cf=True, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, drop_variables=None, consolidated=False, auto_chunk=True, @@ -379,13 +386,8 @@ def open_zarr(store, group=None, synchronizer=None, chunks=None, chunks : int or dict or {None, 'auto'}, optional Chunk sizes along each dimension, e.g., ``5`` or ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created - based on the variable's zarr chunks. If `chunks=None` and - `auto_chunk=False`, zarr array data will lazily convert to numpy - arrays upon access. - auto_chunk : bool, optional - Whether to automatically create dask chunks corresponding to each - variable's zarr chunks. If `chunks=None`, this overrides `chunks`. - Equivalent to `chunks='auto'.` (Default: True) + based on the variable's zarr chunks. If `chunks=None`, zarr array + data will lazily convert to numpy arrays upon access. overwrite_encoded_chunks: bool, optional Whether to drop the zarr chunks encoded for each variable when a dataset is loaded with specified chunk sizes (default: False) @@ -432,9 +434,19 @@ def open_zarr(store, group=None, synchronizer=None, chunks=None, ---------- http://zarr.readthedocs.io/ """ - - if auto_chunk and chunks is None: - chunks = 'auto' # maintain backwards compatibility + if 'auto_chunk' in kwargs: + auto_chunk = kwargs.pop('auto_chunk') + if auto_chunk == True: + chunks = 'auto' # maintain backwards compatibility + elif auto_chunk == False: + chunks = None + + warnings.warn("auto_chunk is deprecated. Use chunks='auto' instead.", + FutureWarning, stacklevel=2) + + if kwargs: + raise TypeError("open_zarr() got unexpected keyword arguments " + + ",".join(kwargs.keys())) if not isinstance(chunks, (int, dict)): if chunks != 'auto' and chunks is not None: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 01ef1caea38..21e5518f6de 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1391,7 +1391,7 @@ def test_auto_chunk(self): original = create_test_data().chunk() with self.roundtrip( - original, open_kwargs={'auto_chunk': False}) as actual: + original, open_kwargs={'chunks': None}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) @@ -1399,7 +1399,7 @@ def test_auto_chunk(self): assert v.chunks is None with self.roundtrip( - original, open_kwargs={'auto_chunk': True}) as actual: + original, open_kwargs={'chunks': 'auto'}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) @@ -1412,7 +1412,7 @@ def test_manual_chunk(self): # All of these should return non-chunked arrays NO_CHUNKS = (None, 0, {}) for no_chunk in NO_CHUNKS: - open_kwargs = {'chunks': no_chunk, 'auto_chunk': False} + open_kwargs = {'chunks': no_chunk} with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): # only index variables should be in memory @@ -1446,13 +1446,33 @@ def test_manual_chunk(self): assert_identical(actual, auto) assert_identical(actual.load(), auto.load()) + + def test_deprecate_auto_chunk(self): + original = create_test_data().chunk() + with pytest.warns(FutureWarning): + with self.roundtrip( + original, open_kwargs={'auto_chunk': True}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # chunk size should be the same as original + assert v.chunks == original[k].chunks + + with pytest.warns(FutureWarning): + with self.roundtrip( + original, open_kwargs={'auto_chunk': False}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + # there should be no chunks + assert v.chunks is None + def test_write_uneven_dask_chunks(self): # regression for GH#2225 original = create_test_data().chunk({'dim1': 3, 'dim2': 4, 'dim3': 3}) - with self.roundtrip( - original, open_kwargs={'auto_chunk': True}) as actual: + original, open_kwargs={'chunks': 'auto'}) as actual: for k, v in actual.data_vars.items(): print(k) assert v.chunks == actual[k].chunks From 7099e70e80c7d81a0c34c8b3ccb2f0c05089dbdb Mon Sep 17 00:00:00 2001 From: Lily Wang Date: Wed, 7 Nov 2018 13:34:44 +1100 Subject: [PATCH 23/30] fixed pep8 issues --- xarray/backends/zarr.py | 4 ++-- xarray/tests/test_backends.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 302301248d3..14119660ad0 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -436,9 +436,9 @@ def open_zarr(store, group=None, synchronizer=None, chunks='auto', """ if 'auto_chunk' in kwargs: auto_chunk = kwargs.pop('auto_chunk') - if auto_chunk == True: + if auto_chunk: chunks = 'auto' # maintain backwards compatibility - elif auto_chunk == False: + else: chunks = None warnings.warn("auto_chunk is deprecated. Use chunks='auto' instead.", diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 21e5518f6de..3dac12b5727 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1446,27 +1446,27 @@ def test_manual_chunk(self): assert_identical(actual, auto) assert_identical(actual.load(), auto.load()) - + def test_deprecate_auto_chunk(self): original = create_test_data().chunk() with pytest.warns(FutureWarning): with self.roundtrip( - original, open_kwargs={'auto_chunk': True}) as actual: + original, open_kwargs={'auto_chunk': True}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) # chunk size should be the same as original assert v.chunks == original[k].chunks - + with pytest.warns(FutureWarning): with self.roundtrip( - original, open_kwargs={'auto_chunk': False}) as actual: + original, open_kwargs={'auto_chunk': False}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory assert v._in_memory == (k in actual.dims) # there should be no chunks assert v.chunks is None - + def test_write_uneven_dask_chunks(self): # regression for GH#2225 From 301953a1ddd3989193ff4f067ceda8b9ee6bbb12 Mon Sep 17 00:00:00 2001 From: Lily Date: Wed, 30 Jan 2019 11:06:35 +1100 Subject: [PATCH 24/30] added warning for bad chunks --- xarray/backends/zarr.py | 47 ++++++++++++++++++++++++++--------- xarray/tests/test_backends.py | 24 ++++++++++++++++++ 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 14119660ad0..9f10c6a3bb7 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -383,11 +383,12 @@ def open_zarr(store, group=None, synchronizer=None, chunks='auto', Array synchronizer provided to zarr group : str, obtional Group path. (a.k.a. `path` in zarr terminology.) - chunks : int or dict or {None, 'auto'}, optional + chunks : int or dict or tuple or {None, 'auto'}, optional Chunk sizes along each dimension, e.g., ``5`` or ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created based on the variable's zarr chunks. If `chunks=None`, zarr array - data will lazily convert to numpy arrays upon access. + data will lazily convert to numpy arrays upon access. This accepts + all the chunk specifications as Dask does. overwrite_encoded_chunks: bool, optional Whether to drop the zarr chunks encoded for each variable when a dataset is loaded with specified chunk sizes (default: False) @@ -486,25 +487,47 @@ def maybe_decode_store(store, lock=False): # adapted from Dataset.Chunk() if isinstance(chunks, int): chunks = dict.fromkeys(ds.dims, chunks) + + if isinstance(chunks, tuple) and len(chunks) == len(ds.dims): + chunks = dict(zip(ds.dims, chunks)) + + def get_chunk(name, var, chunks): + chunk_spec = dict(zip(var.dims, var.encoding.get('chunks'))) + + # Coordinate labels aren't chunked + if var.ndim == 1 and var.dims[0] == name: + return chunk_spec + + if chunks == 'auto': + return chunk_spec + + for dim in var.dims: + if dim in chunks: + spec = chunks[dim] + if isinstance(spec, int): + spec = (spec,) + if isinstance(spec, (tuple, list)) and chunk_spec[dim]: + if any(s % chunk_spec[dim] for s in spec): + print('ok any', spec, chunk_spec[dim], dim) + warnings.warn("Specified Dask chunks %r would " + "separate Zarr chunk shape %r for dimension %r. " + "This significantly degrades performance. " + "Consider rechunking after loading." + % (chunks[dim], chunk_spec[dim], dim)) + chunk_spec[dim] = chunks[dim] + return chunk_spec - def selkeys(dict_, keys): - if dict_ is None: - return None - return dict((d, dict_[d]) for d in keys if d in dict_) def maybe_chunk(name, var, chunks): from dask.base import tokenize - if chunks == 'auto': - chunks = var.encoding.get('chunks') - else: - chunks = selkeys(chunks, var.dims) + chunk_spec = get_chunk(name, var, chunks) - if (var.ndim > 0) and (chunks is not None): + if (var.ndim > 0) and (chunk_spec is not None): # does this cause any data to be read? token2 = tokenize(name, var._data) name2 = 'zarr-%s' % token2 - var = var.chunk(chunks, name=name2, lock=None) + var = var.chunk(chunk_spec, name=name2, lock=None) if overwrite_encoded_chunks and var.chunks is not None: var.encoding['chunks'] = tuple(x[0] for x in var.chunks) return var diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3dac12b5727..1b77da32528 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1446,6 +1446,30 @@ def test_manual_chunk(self): assert_identical(actual, auto) assert_identical(actual.load(), auto.load()) + + def test_warning_on_bad_chunks(self): + original = create_test_data().chunk({'dim1': 4, 'dim2': 3, 'dim3': 5}) + + bad_chunks = (2, {'dim2':(3, 3, 2, 1)}) + for chunks in bad_chunks: + kwargs = {'chunks': chunks} + with pytest.warns(UserWarning): + with self.roundtrip(original, open_kwargs=kwargs) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + + good_chunks = ({'dim2': 3}, {'dim3': 10}) + for chunks in good_chunks: + kwargs = {'chunks': chunks} + with pytest.warns(None) as record: + with self.roundtrip(original, open_kwargs=kwargs) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + assert v._in_memory == (k in actual.dims) + assert len(record) == 0 + + def test_deprecate_auto_chunk(self): original = create_test_data().chunk() From 8e61e7e3ff37a8ebdb42b9918e2a694a037f5342 Mon Sep 17 00:00:00 2001 From: Lily Date: Wed, 30 Jan 2019 11:34:50 +1100 Subject: [PATCH 25/30] fixed lingering rebase conflicts --- doc/whats-new.rst | 11 +++++------ xarray/backends/zarr.py | 26 +++++++++----------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 5d32fffc53a..ed3d2d60442 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -133,6 +133,11 @@ Other enhancements By `Keisuke Fujii `_. - Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`). By `Kevin Squire `_. +- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=`` + parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for + backwards compatibility. The ``overwrite_encoded_chunks`` parameter is + added to remove the original zarr chunk encoding. + By `Lily Wang `_. Bug fixes ~~~~~~~~~ @@ -440,12 +445,6 @@ Bug fixes encoding process if a reference date is used that is so distant that the dates must be encoded using cftime rather than NumPy (:issue:`2272`). By `Spencer Clark `_. - -- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=`` - parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for - backwards compatibility. The ``overwrite_encoded_chunks`` parameter is - added to remove the original zarr chunk encoding. - By `Lily Wang `_. - Chunked datasets can now roundtrip to Zarr storage continually with `to_zarr` and ``open_zarr`` (:issue:`2300`). diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 9f10c6a3bb7..87507ece201 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,11 +1,5 @@ -<<<<<<< HEAD -from collections import OrderedDict -======= -from __future__ import absolute_import, division, print_function - import warnings - ->>>>>>> added deprecation warning +from collections import OrderedDict from distutils.version import LooseVersion import numpy as np @@ -362,8 +356,8 @@ def close(self): def open_zarr(store, group=None, synchronizer=None, chunks='auto', decode_cf=True, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - drop_variables=None, consolidated=False, auto_chunk=True, - overwrite_encoded_chunks=False): + drop_variables=None, consolidated=False, + overwrite_encoded_chunks=False, **kwargs): """Load and decode a dataset from a Zarr store. .. note:: Experimental @@ -387,7 +381,7 @@ def open_zarr(store, group=None, synchronizer=None, chunks='auto', Chunk sizes along each dimension, e.g., ``5`` or ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created based on the variable's zarr chunks. If `chunks=None`, zarr array - data will lazily convert to numpy arrays upon access. This accepts + data will lazily convert to numpy arrays upon access. This accepts all the chunk specifications as Dask does. overwrite_encoded_chunks: bool, optional Whether to drop the zarr chunks encoded for each variable when a @@ -487,7 +481,7 @@ def maybe_decode_store(store, lock=False): # adapted from Dataset.Chunk() if isinstance(chunks, int): chunks = dict.fromkeys(ds.dims, chunks) - + if isinstance(chunks, tuple) and len(chunks) == len(ds.dims): chunks = dict(zip(ds.dims, chunks)) @@ -508,16 +502,14 @@ def get_chunk(name, var, chunks): spec = (spec,) if isinstance(spec, (tuple, list)) and chunk_spec[dim]: if any(s % chunk_spec[dim] for s in spec): - print('ok any', spec, chunk_spec[dim], dim) warnings.warn("Specified Dask chunks %r would " - "separate Zarr chunk shape %r for dimension %r. " - "This significantly degrades performance. " - "Consider rechunking after loading." - % (chunks[dim], chunk_spec[dim], dim)) + "separate Zarr chunk shape %r for dimension %r. " + "This significantly degrades performance. " + "Consider rechunking after loading." + % (chunks[dim], chunk_spec[dim], dim)) chunk_spec[dim] = chunks[dim] return chunk_spec - def maybe_chunk(name, var, chunks): from dask.base import tokenize From 8fd65ea9f7ee841446e3ba1b287239f6cf4f0a16 Mon Sep 17 00:00:00 2001 From: Lily Date: Wed, 30 Jan 2019 11:38:14 +1100 Subject: [PATCH 26/30] fixed pep8 issues --- xarray/backends/zarr.py | 9 +++++---- xarray/tests/test_backends.py | 7 ++----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 87507ece201..d0696f20499 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -503,10 +503,11 @@ def get_chunk(name, var, chunks): if isinstance(spec, (tuple, list)) and chunk_spec[dim]: if any(s % chunk_spec[dim] for s in spec): warnings.warn("Specified Dask chunks %r would " - "separate Zarr chunk shape %r for dimension %r. " - "This significantly degrades performance. " - "Consider rechunking after loading." - % (chunks[dim], chunk_spec[dim], dim)) + "separate Zarr chunk shape %r for " + "dimension %r. This significantly " + "degrades performance. Consider " + "rechunking after loading instead." + % (chunks[dim], chunk_spec[dim], dim)) chunk_spec[dim] = chunks[dim] return chunk_spec diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1b77da32528..5efcdf9cd98 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1446,11 +1446,11 @@ def test_manual_chunk(self): assert_identical(actual, auto) assert_identical(actual.load(), auto.load()) - + def test_warning_on_bad_chunks(self): original = create_test_data().chunk({'dim1': 4, 'dim2': 3, 'dim3': 5}) - bad_chunks = (2, {'dim2':(3, 3, 2, 1)}) + bad_chunks = (2, {'dim2': (3, 3, 2, 1)}) for chunks in bad_chunks: kwargs = {'chunks': chunks} with pytest.warns(UserWarning): @@ -1469,8 +1469,6 @@ def test_warning_on_bad_chunks(self): assert v._in_memory == (k in actual.dims) assert len(record) == 0 - - def test_deprecate_auto_chunk(self): original = create_test_data().chunk() with pytest.warns(FutureWarning): @@ -1491,7 +1489,6 @@ def test_deprecate_auto_chunk(self): # there should be no chunks assert v.chunks is None - def test_write_uneven_dask_chunks(self): # regression for GH#2225 original = create_test_data().chunk({'dim1': 3, 'dim2': 4, 'dim3': 3}) From 4bb164d619cb6f0eacaaaf6122d7f82653d2851c Mon Sep 17 00:00:00 2001 From: Lily Date: Thu, 4 Apr 2019 12:39:26 +1100 Subject: [PATCH 27/30] added stacklevel --- xarray/backends/zarr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index d0696f20499..e20140ee248 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -212,7 +212,7 @@ def encode_zarr_variable(var, needs_copy=True, name=None): # zarr allows unicode, but not variable-length strings, so it's both # simpler and more compact to always encode as UTF-8 explicitly. # TODO: allow toggling this explicitly via dtype in encoding. - coder = coding.strings.EncodedStringCoder(allows_unicode=False) + coder = coding.strings.EncodedStringCoder(allows_unicode=True) var = coder.encode(var, name=name) var = coding.strings.ensure_fixed_length_bytes(var) @@ -507,7 +507,8 @@ def get_chunk(name, var, chunks): "dimension %r. This significantly " "degrades performance. Consider " "rechunking after loading instead." - % (chunks[dim], chunk_spec[dim], dim)) + % (chunks[dim], chunk_spec[dim], dim), + stacklevel=2) chunk_spec[dim] = chunks[dim] return chunk_spec From 485717d285c3afb55bf65cda5cd1f1062bab04e8 Mon Sep 17 00:00:00 2001 From: Lily Date: Thu, 4 Apr 2019 12:44:45 +1100 Subject: [PATCH 28/30] fixed pep8 issues --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5efcdf9cd98..bf40e529931 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1437,7 +1437,7 @@ def test_manual_chunk(self): open_kwargs = {'chunks': chunks, 'overwrite_encoded_chunks': True} with self.roundtrip(original, open_kwargs=open_kwargs) as actual: for k, v in actual.variables.items(): - assert v.chunks == rechunked[k].chunks + assert v.chunks == rechunked[k].chunks with self.roundtrip(actual) as auto: # encoding should have changed From b0e1e1e68d80d4efb334ab40515e9282726a6407 Mon Sep 17 00:00:00 2001 From: Lily Date: Fri, 12 Apr 2019 11:34:02 +1000 Subject: [PATCH 29/30] disallow unicode again --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e20140ee248..f5364314af8 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -212,7 +212,7 @@ def encode_zarr_variable(var, needs_copy=True, name=None): # zarr allows unicode, but not variable-length strings, so it's both # simpler and more compact to always encode as UTF-8 explicitly. # TODO: allow toggling this explicitly via dtype in encoding. - coder = coding.strings.EncodedStringCoder(allows_unicode=True) + coder = coding.strings.EncodedStringCoder(allows_unicode=False) var = coder.encode(var, name=name) var = coding.strings.ensure_fixed_length_bytes(var) From f17cb5e99779acda42e211a5d18868aa168cef3b Mon Sep 17 00:00:00 2001 From: Lily Date: Fri, 12 Apr 2019 11:37:49 +1000 Subject: [PATCH 30/30] disallow unicode again --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e20140ee248..f5364314af8 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -212,7 +212,7 @@ def encode_zarr_variable(var, needs_copy=True, name=None): # zarr allows unicode, but not variable-length strings, so it's both # simpler and more compact to always encode as UTF-8 explicitly. # TODO: allow toggling this explicitly via dtype in encoding. - coder = coding.strings.EncodedStringCoder(allows_unicode=True) + coder = coding.strings.EncodedStringCoder(allows_unicode=False) var = coder.encode(var, name=name) var = coding.strings.ensure_fixed_length_bytes(var)