From 2bea4497ca4da273a2f57abaa51decc5d572dc15 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 31 Mar 2022 16:09:43 +0100 Subject: [PATCH 1/3] compressed --- cf/data/data.py | 69 +++++++++++--------------------------------- cf/test/test_Data.py | 54 +++++++++++++++++----------------- 2 files changed, 44 insertions(+), 79 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index c93d48f7d4..7b51527c14 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -976,7 +976,7 @@ def __len__(self): """ dx = self._get_dask() if math.isnan(dx.size): - logger.warning("Computing data len: Performance may be degraded") + logger.debug("Computing data len: Performance may be degraded") dx.compute_chunk_sizes() return len(dx) @@ -6274,9 +6274,7 @@ def nbytes(self): """ dx = self._get_dask() if math.isnan(dx.size): - logger.warning( - "Computing data nbytes: Performance may be degraded" - ) + logger.debug("Computing data nbytes: Performance may be degraded") dx.compute_chunk_sizes() return dx.nbytes @@ -6343,7 +6341,7 @@ def shape(self): """ dx = self._get_dask() if math.isnan(dx.size): - logger.warning("Computing data shape: Performance may be degraded") + logger.debug("Computing data shape: Performance may be degraded") dx.compute_chunk_sizes() return dx.shape @@ -6384,7 +6382,7 @@ def size(self): dx = self._get_dask() size = dx.size if math.isnan(size): - logger.warning("Computing data size: Performance may be degraded") + logger.debug("Computing data size: Performance may be degraded") dx.compute_chunk_sizes() size = dx.size @@ -8590,6 +8588,7 @@ def close(self): for partition in self.partitions.matrix.flat: partition.file_close() + @daskified(_DASKIFIED_VERBOSE) @_inplace_enabled(default=False) def compressed(self, inplace=False): """Return all non-masked values in a one dimensional data array. @@ -8613,7 +8612,7 @@ def compressed(self, inplace=False): **Examples** - >>> d = cf.Data(numpy.arange(12).reshape(3, 4)) + >>> d = cf.Data(numpy.arange(12).reshape(3, 4), 'm') >>> print(d.array) [[ 0 1 2 3] [ 4 5 6 7] @@ -8630,58 +8629,24 @@ def compressed(self, inplace=False): [ 0 1 2 3 4 6 7 8 9 10] >>> d = cf.Data(9) - >>> print(d.array) - 9 >>> print(d.compressed().array) - 9 + [9] """ d = _inplace_enabled_define_and_cleanup(self) - ndim = d.ndim - - if ndim != 1: - d.flatten(inplace=True) - - n_non_missing = d.count() - if n_non_missing == d.size: - return d - - comp = self.empty( - shape=(n_non_missing,), dtype=self.dtype, units=self.Units + dx = d._get_dask().ravel() + dx = da.blockwise( + np.ma.compressed, + "i", + dx.ravel(), + "i", + adjust_chunks={"i": lambda n: np.nan}, + dtype=dx.dtype, + meta=np.array((), dtype=dx.dtype), ) - # Find the number of array elements that fit in one chunk - n = int(cf_chunksize() // (self.dtype.itemsize + 1.0)) - - # Loop around each chunk's worth of elements and assign the - # non-missing values to the compressed data - i = 0 - start = 0 - for _ in range(1 + d.size // n): - if i >= d.size: - break - - array = d[i : i + n].array - if np.ma.isMA(array): - array = array.compressed() - - size = array.size - if size >= 1: - end = start + size - comp[start:end] = array - start = end - - i += n - - if not d.ndim: - comp.squeeze(inplace=True) - - if inplace: - d.__dict__ = comp.__dict__ - else: - d = comp - + d._set_dask(dx, reset_mask_hardness=False) return d @daskified(_DASKIFIED_VERBOSE) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 7e991b8f68..d1653289b0 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -671,40 +671,40 @@ def test_Data_diff(self): self.assertTrue((a_diff == d_diff).all()) self.assertTrue((a_diff.mask == d_diff.mask).all()) - @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_ndim'") def test_Data_compressed(self): - if self.test_only and inspect.stack()[0][3] not in self.test_only: - return - a = np.ma.arange(12).reshape(3, 4) + d = cf.Data(a, "m", chunks=2) + e = d.compressed(inplace=True) + self.assertIsNone(d.compressed(inplace=True)) + self.assertEqual(d.shape, (a.size,)) + self.assertEqual(d.Units, cf.Units("m")) + self.assertEqual(d.dtype, a.dtype) - d = cf.Data(a) - self.assertTrue((d.array == a).all()) - self.assertTrue((a.compressed() == d.compressed()).all()) - - e = d.copy() - x = e.compressed(inplace=True) - self.assertIsNone(x) - self.assertTrue(e.equals(d.compressed())) - - a[1, 1] = np.ma.masked - a[2, 3] = np.ma.masked + d = cf.Data(a, "m", chunks=2) + self.assertTrue((d.compressed().array == a.compressed()).all()) - d = cf.Data(a) - self.assertTrue((d.array == a).all()) - self.assertTrue((d.mask.array == a.mask).all()) - self.assertTrue((a.compressed() == d.compressed()).all()) + a[2] = np.ma.masked + d = cf.Data(a, "m", chunks=2) + self.assertTrue((d.compressed().array == a.compressed()).all()) - e = d.copy() - x = e.compressed(inplace=True) - self.assertIsNone(x) - self.assertTrue(e.equals(d.compressed())) + a[...] = np.ma.masked + d = cf.Data(a, "m", chunks=2) + e = d.compressed() + self.assertEqual(e.shape, (0,)) + self.assertTrue((e.array == a.compressed()).all()) - d = cf.Data(self.a, "km") - self.assertTrue((self.a.flatten() == d.compressed()).all()) + # Scalar arrays + a = np.ma.array(9) + d = cf.Data(a, "m") + e = d.compressed() + self.assertEqual(e.shape, (1,)) + self.assertTrue((e.array == a.compressed()).all()) - d = cf.Data(self.ma, "km") - self.assertTrue((self.ma.compressed() == d.compressed()).all()) + a = np.ma.array(9, mask=True) + d = cf.Data(a, "m") + e = d.compressed() + self.assertEqual(e.shape, (0,)) + self.assertTrue((e.array == a.compressed()).all()) @unittest.skipIf(TEST_DASKIFIED_ONLY, "no attribute '_shape'") def test_Data_stats(self): From 0ae049d9f7cdbf0c9a6cad4b55ca670e05ff5a18 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 20 Apr 2022 20:46:31 +0100 Subject: [PATCH 2/3] Remove unnecessary compressed operation from test Co-authored-by: Sadie L. Bartholomew --- cf/test/test_Data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 1c452e93cb..76fbcf931f 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -667,7 +667,6 @@ def test_Data_diff(self): def test_Data_compressed(self): a = np.ma.arange(12).reshape(3, 4) d = cf.Data(a, "m", chunks=2) - e = d.compressed(inplace=True) self.assertIsNone(d.compressed(inplace=True)) self.assertEqual(d.shape, (a.size,)) self.assertEqual(d.Units, cf.Units("m")) From c9b241890f812a489a306e47a8f31efcaf1269d9 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 20 Apr 2022 20:47:29 +0100 Subject: [PATCH 3/3] remove unnecessary ravel from compressed --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index 9b15591d73..aabbad294b 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -8625,7 +8625,7 @@ def compressed(self, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = d._get_dask().ravel() + dx = d._get_dask() dx = da.blockwise( np.ma.compressed, "i",