From fee3e91c75e332d6f64eeb610fb05576e0d90a5a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 29 Mar 2022 09:47:22 +0100 Subject: [PATCH 1/6] dask-hash --- cf/data/data.py | 54 +--------------------- cf/data/mixin/deprecations.py | 61 +++++++++++++++++++++++++ cf/functions.py | 86 ++++++++++++++++------------------- cf/test/test_functions.py | 19 ++++++++ 4 files changed, 121 insertions(+), 99 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index c93d48f7d4..45e7f9422a 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -40,7 +40,7 @@ from ..functions import chunksize as cf_chunksize from ..functions import default_netCDF_fillvals from ..functions import fm_threshold as cf_fm_threshold -from ..functions import free_memory, hash_array +from ..functions import free_memory from ..functions import inspect as cf_inspect from ..functions import log_level, parse_indices, pathjoin from ..functions import rtol as cf_rtol @@ -812,58 +812,6 @@ def __data__(self): """Returns a new reference to self.""" return self - def __hash__(self): - """The built-in function `hash` - - Generating the hash temporarily realizes the entire array in - memory, which may not be possible for large arrays. - - The hash value is dependent on the data-type and shape of the data - array. If the array is a masked array then the hash value is - independent of the fill value and of data array values underlying - any masked elements. - - The hash value may be different if regenerated after the data - array has been changed in place. - - The hash value is not guaranteed to be portable across versions of - Python, numpy and cf. - - :Returns: - - `int` - The hash value. - - **Examples:** - - >>> print(d.array) - [[0 1 2 3]] - >>> d.hash() - -8125230271916303273 - >>> d[1, 0] = numpy.ma.masked - >>> print(d.array) - [[0 -- 2 3]] - >>> hash(d) - 791917586613573563 - >>> d.hardmask = False - >>> d[0, 1] = 999 - >>> d[0, 1] = numpy.ma.masked - >>> d.hash() - 791917586613573563 - >>> d.squeeze() - >>> print(d.array) - [0 -- 2 3] - >>> hash(d) - -7007538450787927902 - >>> d.dtype = float - >>> print(d.array) - [0.0 -- 2.0 3.0] - >>> hash(d) - -4816859207969696442 - - """ - return hash_array(self.array) - @daskified(_DASKIFIED_VERBOSE) def __float__(self): """Called to implement the built-in function `float` diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index 205e016333..e267d25884 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -7,6 +7,67 @@ class DataClassDeprecationsMixin: """Deprecated attributes and methods for the Data class.""" + def __hash__(self): + """The built-in function `hash` + + Depreacted at version TODODASK. Consider using the + `cf.hash_array` function instead. + + Generating the hash temporarily realizes the entire array in + memory, which may not be possible for large arrays. + + The hash value is dependent on the data-type and shape of the data + array. If the array is a masked array then the hash value is + independent of the fill value and of data array values underlying + any masked elements. + + The hash value may be different if regenerated after the data + array has been changed in place. + + The hash value is not guaranteed to be portable across versions of + Python, numpy and cf. + + :Returns: + + `int` + The hash value. + + **Examples** + + >>> print(d.array) + [[0 1 2 3]] + >>> d.hash() + -8125230271916303273 + >>> d[1, 0] = numpy.ma.masked + >>> print(d.array) + [[0 -- 2 3]] + >>> hash(d) + 791917586613573563 + >>> d.hardmask = False + >>> d[0, 1] = 999 + >>> d[0, 1] = numpy.ma.masked + >>> d.hash() + 791917586613573563 + >>> d.squeeze() + >>> print(d.array) + [0 -- 2 3] + >>> hash(d) + -7007538450787927902 + >>> d.dtype = float + >>> print(d.array) + [0.0 -- 2.0 3.0] + >>> hash(d) + -4816859207969696442 + + """ + _DEPRECATION_ERROR_METHOD( + self, + "__hash__", + message="Consider using 'cf.hash_array' instead.", + version="TODODASK", + removed_at="5.0.0", + ) + @property def Data(self): """Deprecated at version 3.0.0, use attribute `data` instead.""" diff --git a/cf/functions.py b/cf/functions.py index 070f30f84e..a4d057f0ad 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -1,6 +1,7 @@ import atexit import csv import ctypes.util +import hashlib import importlib import os import platform @@ -10,8 +11,7 @@ import urllib.parse import warnings from collections.abc import Iterable -from hashlib import md5 as hashlib_md5 -from marshal import dumps as marshal_dumps +from marshal import dumps from math import ceil as math_ceil from numbers import Integral from os import getpid, listdir, mkdir @@ -23,14 +23,12 @@ from os.path import relpath as _os_path_relpath import cfdm - -# import cPickle import netCDF4 +import numpy as np from numpy import __file__ as _numpy__file__ from numpy import __version__ as _numpy__version__ from numpy import all as _numpy_all from numpy import allclose as _x_numpy_allclose -from numpy import ascontiguousarray as _numpy_ascontiguousarray from numpy import isclose as _x_numpy_isclose from numpy import shape as _numpy_shape from numpy import take as _numpy_take @@ -2611,69 +2609,66 @@ def pathjoin(path1, path2): return _os_path_join(path1, path2) -def hash_array(array): - """Return the hash value of a numpy array. +def hash_array(array, algorithm=hashlib.sha1): + """Return a hash value of a numpy array. - The hash value is dependent on the data type, shape of the data + The hash value is dependent on the data type and the shape of the array. If the array is a masked array then the hash value is independent of the fill value and of data array values underlying any masked elements. - The hash value is not guaranteed to be portable across versions of - Python, numpy and cf. - :Parameters: array: `numpy.ndarray` The numpy array to be hashed. May be a masked array. + algorthim: `hashlib` constructor function + Constructor function for the desired hash algorithm, + e.g. `hashlib.md5`, `hashlib.sha256`, etc. + + .. versionadded:: TODODASK + :Returns: `int` The hash value. - **Examples:** + **Examples** + + >>> a = np.array([[0, 1, 2, 3]]) + >>> cf.hash_array(a) + -5620332080097671134 - >>> print(array) - [[0 1 2 3]] + >>> a = np.ma.array([[0, 1, 2, 3]], mask=[[0, 1, 0, 0]]) >>> cf.hash_array(array) - -8125230271916303273 - >>> array[1, 0] = numpy.ma.masked - >>> print(array) + 8372868545804866378 + + >>> a[0, 1] = 999 + >>> a[0, 1] = np.ma.masked + >>> print(a) [[0 -- 2 3]] - >>> cf.hash_array(array) - 791917586613573563 - >>> array.hardmask = False - >>> array[0, 1] = 999 - >>> array[0, 1] = numpy.ma.masked - >>> cf.hash_array(array) - 791917586613573563 - >>> array.squeeze() - >>> print(array) - [0 -- 2 3] - >>> cf.hash_array(array) - -7007538450787927902 - >>> array.dtype = float - >>> print(array) - [0.0 -- 2.0 3.0] - >>> cf.hash_array(array) - -4816859207969696442 + >>> print(a.data) + [[ 0 999 2 3]] + >>> cf.hash_array(a) + 8372868545804866378 - """ - h = hashlib_md5() + >>> a = a.astype(float) + >>> cf.hash_array(a) + 5950106833921144220 - h_update = h.update + """ + h = algorithm() - h_update(marshal_dumps(array.dtype.name)) - h_update(marshal_dumps(array.shape)) + h.update(dumps(array.dtype.name)) + h.update(dumps(array.shape)) - if _numpy_ma_isMA(array): - if _numpy_ma_is_masked(array): + if np.ma.isMA(array): + if np.ma.is_masked(array): mask = array.mask if not mask.flags.c_contiguous: - mask = _numpy_ascontiguousarray(mask) + mask = np.ascontiguousarray(mask) - h_update(mask) + h.update(mask) array = array.copy() array.set_fill_value() array = array.filled() @@ -2681,10 +2676,9 @@ def hash_array(array): array = array.data if not array.flags.c_contiguous: - # array = array.copy() - array = _numpy_ascontiguousarray(array) + array = np.ascontiguousarray(array) - h_update(array) + h.update(array) return hash(h.digest()) diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 98fb157c1d..0ba1db53b0 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -5,6 +5,8 @@ import sys import unittest +import numpy as np + faulthandler.enable() # to debug seg faults and timeouts import cf @@ -312,6 +314,23 @@ def test_environment(self): ]: self.assertIn(component, ep) + def test_hash_array(self): + import hashlib + + a = np.ma.array([[0, 1, 2, 3], [0, 1, 2, 3]]) + a[0, 0] = np.ma.masked + a = a.transpose() + + self.assertFalse(a.flags.c_contiguous) + self.assertFalse(a.mask.flags.c_contiguous) + + h = cf.hash_array(a) + self.assertIsInstance(h, int) + self.assertNotEqual(cf.hash_array(a, algorithm=hashlib.sha256), h) + + a.set_fill_value(a.fill_value + 1) + self.assertEqual(cf.hash_array(a), h) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From 9d4d2564abb943a7b729392c025b6911bbbaba7b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 1 Apr 2022 08:31:16 +0100 Subject: [PATCH 2/6] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/mixin/deprecations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index e267d25884..7e9e26a2a1 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -8,7 +8,7 @@ class DataClassDeprecationsMixin: """Deprecated attributes and methods for the Data class.""" def __hash__(self): - """The built-in function `hash` + """The built-in function `hash`. Depreacted at version TODODASK. Consider using the `cf.hash_array` function instead. From ed5a1c780876954ef3c3dd94e0981922fbb5edce Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 1 Apr 2022 08:31:28 +0100 Subject: [PATCH 3/6] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/mixin/deprecations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index 7e9e26a2a1..d092e589ea 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -10,7 +10,7 @@ class DataClassDeprecationsMixin: def __hash__(self): """The built-in function `hash`. - Depreacted at version TODODASK. Consider using the + Deprecated at version TODODASK. Consider using the `cf.hash_array` function instead. Generating the hash temporarily realizes the entire array in From 45a1e3b96861a22a94b3375912e773b51ecbd6fd Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 1 Apr 2022 08:31:40 +0100 Subject: [PATCH 4/6] Typo Co-authored-by: Sadie L. Bartholomew --- cf/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/functions.py b/cf/functions.py index a4d057f0ad..ce4ffeba19 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -2622,7 +2622,7 @@ def hash_array(array, algorithm=hashlib.sha1): array: `numpy.ndarray` The numpy array to be hashed. May be a masked array. - algorthim: `hashlib` constructor function + algorithm: `hashlib` constructor function Constructor function for the desired hash algorithm, e.g. `hashlib.md5`, `hashlib.sha256`, etc. From dccdc38d536a2acd0f6a43b72de57dec2dc38e1c Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 1 Apr 2022 08:31:52 +0100 Subject: [PATCH 5/6] Typo Co-authored-by: Sadie L. Bartholomew --- cf/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/functions.py b/cf/functions.py index ce4ffeba19..d6a76cfc0e 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -2626,7 +2626,7 @@ def hash_array(array, algorithm=hashlib.sha1): Constructor function for the desired hash algorithm, e.g. `hashlib.md5`, `hashlib.sha256`, etc. - .. versionadded:: TODODASK + .. versionadded:: TODODASK :Returns: From 8299db6f34677d2ebfd81cfd63b69cc52c1f3d14 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 1 Apr 2022 08:37:18 +0100 Subject: [PATCH 6/6] deprecation message --- cf/data/mixin/deprecations.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index d092e589ea..3d4327c672 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -63,7 +63,8 @@ def __hash__(self): _DEPRECATION_ERROR_METHOD( self, "__hash__", - message="Consider using 'cf.hash_array' instead.", + message="Consider using 'cf.hash_array' on the underlying " + "array instead.", version="TODODASK", removed_at="5.0.0", )