From c5725a9f5e98a248212a7f2eb1d9b35bf3ce8994 Mon Sep 17 00:00:00 2001
From: Dick Carter <dcarter@nvidia.com>
Date: Fri, 10 Apr 2020 20:34:47 -0700
Subject: [PATCH 01/11] Add sm arch 80 to Makefile

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 4ee71c9478b1..7494ada0c258 100644
--- a/Makefile
+++ b/Makefile
@@ -429,7 +429,7 @@ endif
 # be JIT-compiled by the updated driver from the included PTX.
 ifeq ($(USE_CUDA), 1)
 ifeq ($(CUDA_ARCH),)
-	KNOWN_CUDA_ARCHS := 30 35 50 52 60 61 70 75
+	KNOWN_CUDA_ARCHS := 30 35 50 52 60 61 70 75 80
 	# Run nvcc on a zero-length file to check architecture-level support.
 	# Create args to include SASS in the fat binary for supported levels.
 	CUDA_ARCH := $(foreach arch,$(KNOWN_CUDA_ARCHS), \

From b6db51b5ffbcdaf24ea6d327977fdcb76b91e7ce Mon Sep 17 00:00:00 2001
From: Dick Carter <dcarter@nvidia.com>
Date: Sun, 19 Jul 2020 14:12:50 -0700
Subject: [PATCH 02/11] Unittest tolerance handling improvements (#18694)

* Add sm arch 80 to Makefile

* Add TF32 to cuBLAS GEMMs

Signed-off-by: Serge Panev <spanev@nvidia.com>

* Add CUDA version guards

Signed-off-by: Serge Panev <spanev@nvidia.com>

* Remove useless TF32 for double and old CUDA version

Signed-off-by: Serge Panev <spanev@nvidia.com>

* Factorize VERSION_ADJUSTED_TF32_MATH

Signed-off-by: Serge Panev <spanev@nvidia.com>

* Add TF32 considerations to test_util.py:check_consistency()

* Bypass test_gluon_gpu.py:test_large_models if gmem >32GB

* Default tols in assert_almost_equal() now a function of dtype and ctx

* Expand types listed by default_tols()

* Fix pylint

* All with_seed() tests to waitall in teardown

* Elevate MXNET_TEST_SEED logging to WARNING

* Revert test_gluon_gpu.py:test_rnn_layer to default tols

* Fix test_gluon_model_zoo_gpu.py::test_inference and test_operator_gpu.py::test_np_linalg_{solve,tensorinv}

* test_numpy_interoperability.py to not fix seed for rest of CI

* Further fix to test_np_linalg_tensorinv

* Fix test_gluon_data.py:test_dataloader_context when run on 1-GPU system.

* Fix test_operator_gpu.py::test_embedding_with_type

* Fix test_operator_gpu.py::{test_*convolution_large_c,test_np_linalg_tensorsolve}

* Remove unneeded print() from test_numpy_interoperability.py

* Unify tol handling of check_consistency() and assert_almost_equal().  Test tweaks.

* Add tol handling of assert_almost_equal() with number args

* Add tol handling of bool comparisons

* Fix test_numpy_op.py::test_np_random_rayleigh

* Fix test_operator_gpu.py::test_batchnorm_with_type

* Fix test_gluon.py::test_sync_batchnorm in cpu selftest

* Improve unittest failure reporting

* Add to robustness of test_operator_gpu.py::test_embedding_with_type

* Check_consistency() to use equal backward gradients for increased test robustness

* Fix test_operator_gpu.py::test_{fully_connected,gemm}.  Add default_numeric_eps().

* test_utils.py fix for numeric gradient calc

* Reinstate rtol=1e-2 for test_operator.py::test_order

* Remove auto-cast of check_consistency() input data to least precise dtype (not needed)

* Fix test_operator.py::test_{reciprocal,cbrt,rcbrt}_op

* Expand default float64 numeric_eps for test_operator_gpu.py::test_softmin

* Fix segfault-on-error of @retry decorator. Add test isolation.

* assert_almost_equal() to handle a,b scalars

* Fix test_operator_gpu.py::test_gluon_{mvn,mvn_v1} race

* Fix test_operator_gpu.py::test_flatten_slice_after_conv via scale

* Remove test_utils.py:almost_equal_ignore_nan()

* Fix sample vs. pop variance issue with test_numpy_op.py::test_npx_batch_norm

* Expose test_utils.py:effective_dtype() and use to fix test_operator_gpu.py::test_np_linalg_svd

* Fix true_divide int_array / int_scalar -> float_array to honor np_default_dtype

* Try test_elemwise_binary_ops serial to avoid pytest worker crash

* Fix (log_)softmax backward on empty ndarray

* Temporarily log all CI seeds to troubleshoot seed non-determinism

* Revert "Temporarily log all CI seeds to troubleshoot seed non-determinism"

This reverts commit f60eff20785b812ac4fcd70d51359ee0cbfb3e47.

* Temp log all CI seeds to troubleshoot unwanted seed determinism

* Revert "Add sm arch 80 to Makefile"

This reverts commit f9306cecc53b0633ef5f5b7b000802fbf0d73fe9.

* Same fix of sample vs. pop variance issue, now with test_operator_gpu.py::test_batchnorm

* Revert "Temp log all CI seeds to troubleshoot unwanted seed determinism"

This reverts commit ff328efb0be3445690669d5437a6af575ff12b49.

* Marking test_sparse_dot_grad with garbage_expected after teardown error

* Fix flakiness of test_gluon_probability{_v1,_v2}.py::test_gluon_kl{_v1,}

* Temp skip of test_aggregate_duplication on gpu

* Add seeding to test_{numpy,}_contrib_gluon_data_vision.py.  Make created files unique.

* Add ndarray module isolation to help debug test_bbox_augmenters worker crash

* Marking test_sparse_square_sum serial after pytest worker crash

* Fix flakiness of test_gluon_probability{_v1,_v2}.py::test_half_cauchy{_v1,}

Co-authored-by: Serge Panev <spanev@nvidia.com>
Co-authored-by: Bart Gawrych <gawrych.bartlomiej@intel.com>
---
 python/mxnet/test_utils.py                    | 360 +++++++++++-------
 src/operator/linalg.h                         |   8 +
 src/operator/linalg_impl.h                    |  34 +-
 src/operator/numpy/np_true_divide-inl.h       |  19 +-
 tests/python/gpu/test_gluon_gpu.py            |  16 +-
 tests/python/gpu/test_gluon_model_zoo_gpu.py  |   2 +-
 tests/python/gpu/test_operator_gpu.py         |  95 ++---
 tests/python/unittest/common.py               |  14 +-
 tests/python/unittest/test_autograd.py        |   1 +
 tests/python/unittest/test_gluon.py           |   4 +-
 tests/python/unittest/test_ndarray.py         |   2 +-
 .../unittest/test_numpy_interoperability.py   |   7 +-
 tests/python/unittest/test_numpy_op.py        |  43 +--
 tests/python/unittest/test_operator.py        | 126 +++---
 tests/python/unittest/test_sparse_operator.py |   1 +
 15 files changed, 440 insertions(+), 292 deletions(-)

diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 9a24b5fd7d51..9e544547b1a1 100755
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -71,19 +71,110 @@ def default_dtype():
     # _TODO: get default dtype from environment variable
     return np.float32
 
+def default_rtols():
+    """Get default relative tolerances (dict of np.dtype -> float) for comparisons of each data type."""
+    return {np.dtype(np.float16): 1e-2,
+            np.dtype(np.float32): 1e-4,
+            np.dtype(np.float64): 1e-5,
+            np.dtype(np.bool_): 0,  # np.bool_ exists in every NumPy release; the np.bool alias does not
+            np.dtype(np.int8): 0,
+            np.dtype(np.uint8): 0,
+            np.dtype(np.int32): 0,
+            np.dtype(np.uint32): 0,
+            np.dtype(np.int64): 0,
+            np.dtype(np.uint64): 0}
+
+def default_atols():
+    """Get default absolute tolerances (dict of np.dtype -> float) for comparisons of each data type."""
+    return {np.dtype(np.float16): 1e-1,
+            np.dtype(np.float32): 1e-3,
+            np.dtype(np.float64): 1e-20,
+            np.dtype(np.bool_): 0,  # np.bool_ exists in every NumPy release; the np.bool alias does not
+            np.dtype(np.int8): 0,
+            np.dtype(np.uint8): 0,
+            np.dtype(np.int32): 0,
+            np.dtype(np.uint32): 0,
+            np.dtype(np.int64): 0,
+            np.dtype(np.uint64): 0}
+
+def default_numeric_eps():
+    """Get default finite-difference epsilons (dict of np.dtype -> float) for numeric gradient checks."""
+    # prefer a power-of-two eps, since no bits are dropped when serving as an input delta
+    return {np.dtype(np.float16): 1.0 / 2**6,
+            np.dtype(np.float32): 1.0 / 2**9,
+            np.dtype(np.float64): 1.0 / 2**14}
+
+
+def effective_dtype(dat):
+    """ Return the most appropriate dtype for determining the tolerance used in dat comparisons
+    Parameters
+    ----------
+    dat : np.ndarray or mx.nd.array or mx.np.ndarray
+    """
+    # On arch 80 gpus, a float32-io gemm or conv op will trim the mantissa of data
+    # inputs to be of comparable precision to a float16, so float16 becomes the
+    # 'effective dtype' for tolerance tests involving such op outputs.
 
-def get_atol(atol=None):
-    """Get default numerical threshold for regression test."""
-    # _TODO: get from env variable, different threshold might
-    # be needed for different device and dtype
-    return 1e-20 if atol is None else atol
+    # Is TF32 enabled in the ctx (the default on arch 80 and newer GPUs)?  Any query failure means "no".
+    def is_TF32_enabled(ctx):
+        try:
+            return (ctx is not None and ctx.device_type == 'gpu' and
+                    get_cuda_compute_capability(ctx) >= 80 and
+                    os.environ.get('NVIDIA_TF32_OVERRIDE') != '0')
+        except Exception:  # pylint: disable=broad-except
+            return False
+
+    ctx = dat.ctx if hasattr(dat, 'ctx') else None
+    dtype = np.dtype(dat.dtype)
+    if dtype == np.dtype(np.float32) and is_TF32_enabled(ctx):
+        return np.dtype(np.float16)
+    else:
+        return dtype
+
+
+def get_tolerance(dat, tol, default_tol):
+    """ Return the tolerance to be used for dat comparisons based on the given tol, datatype and context.
+    Parameters
+    ----------
+    dat : np.ndarray or mx.nd.array or mx.np.ndarray
+    tol : None, or float (returned as-is), or a dict of dtype->float
+    default_tol : dict of dtype->float, consulted when tol has no entry for dat's effective dtype
+    """
+
+    if isinstance(tol, numbers.Number):
+        return tol
+
+    # Use the caller's tol dict if it has an entry for dtype, else the default tol dict.
+    # The default is indexed lazily, so it need not cover dtypes that tol already handles.
+    dtype = effective_dtype(dat)
+    tol = {} if tol is None else tol
+    return tol[dtype] if dtype in tol else default_tol[dtype]
+
+
+def get_tols(x, y, rtol, atol):
+    """For comparing two datasets 'x' and 'y', what relative and absolute tolerances should be used."""
+    # Tolerance analysis needs 'dtype' of 'x' and 'y', so convert numbers to numpy scalars as needed
+    if isinstance(x, numbers.Number):
+        x = np.array(x)
+    if isinstance(y, numbers.Number):
+        y = np.array(y)
+
+    # If tols are not specified, use the largest default tol for 'x' and 'y' based on their ctx and dtype.
+    rtol = max(get_tolerance(x, rtol, default_rtols()),
+               get_tolerance(y, rtol, default_rtols()))
+    atol = max(get_tolerance(x, atol, default_atols()),
+               get_tolerance(y, atol, default_atols()))
 
+    return rtol, atol
 
-def get_rtol(rtol=None):
+
+def get_atol(atol=None, dtype=np.dtype(np.float64)):
     """Get default numerical threshold for regression test."""
-    # _TODO: get from env variable, different threshold might
-    # be needed for different device and dtype
-    return 1e-5 if rtol is None else rtol
+    return default_atols()[np.dtype(dtype)] if atol is None else atol  # accept np.float32 as well as np.dtype
+
+def get_rtol(rtol=None, dtype=np.dtype(np.float64)):
+    """Return rtol if given, else the default relative tolerance for dtype (np.dtype or scalar type)."""
+    return default_rtols()[np.dtype(dtype)] if rtol is None else rtol
 
 def get_etol(etol=None):
     """Get default numerical threshold for regression test."""
@@ -513,10 +604,8 @@ def np_reduce(dat, axis, keepdims, numpy_reduce_func):
     return ret
 
 
-def find_max_violation(a, b, rtol=None, atol=None):
+def _find_max_violation(a, b, rtol, atol):
     """Finds and returns the location of maximum violation."""
-    rtol = get_rtol(rtol)
-    atol = get_atol(atol)
     # 'smart' absdiff that considers inf's as equals (to match np.allclose)
     absdiff = np.where(np.equal(a, b), 0, np.abs(a-b))
     tol = atol + rtol*np.abs(b)
@@ -579,9 +668,9 @@ def assert_almost_equal(a, b, rtol=None, atol=None, names=('a', 'b'), equal_nan=
     ----------
     a : np.ndarray or mx.nd.array
     b : np.ndarray or mx.nd.array
-    rtol : None or float
+    rtol : None or float or dict of dtype -> float
         The relative threshold. Default threshold will be used if set to ``None``.
-    atol : None or float
+    atol : None or float or dict of dtype -> float
         The absolute threshold. Default threshold will be used if set to ``None``.
     names : tuple of names, optional
         The names used in error message when an exception occurs
@@ -593,8 +682,12 @@ def assert_almost_equal(a, b, rtol=None, atol=None, names=('a', 'b'), equal_nan=
     if not use_broadcast:
         checkShapes(a, b)
 
-    rtol = get_rtol(rtol)
-    atol = get_atol(atol)
+    rtol, atol = get_tols(a, b, rtol, atol)
+
+    if isinstance(a, mx.numpy.ndarray):
+        a = a.asnumpy()
+    if isinstance(b, mx.numpy.ndarray):
+        b = b.asnumpy()
     use_np_allclose = isinstance(a, np.ndarray) and isinstance(b, np.ndarray)
     if not use_np_allclose:
         if not (hasattr(a, 'context') and hasattr(b, 'context') and a.context == b.context and a.dtype == b.dtype):
@@ -618,32 +711,37 @@ def assert_almost_equal(a, b, rtol=None, atol=None, names=('a', 'b'), equal_nan=
         a = a.asnumpy()
         b = b.asnumpy()
 
-    index, rel = find_max_violation(a, b, rtol, atol)
-    indexErr = index
-    relErr = rel
-
-    print('\n*** Maximum errors for vector of size {}:  rtol={}, atol={}\n'.format(a.size, rtol, atol))
-    aTmp = a.copy()
-    bTmp = b.copy()
-    i = 1
-    while i <= a.size:
-        if i <= mismatches[0]:
-            print("%3d: Error %f  %s" %(i, rel, locationError(a, b, index, names)))
+    index, rel = _find_max_violation(a, b, rtol, atol)
+    if index != ():
+        # a, b are the numpy arrays
+        indexErr = index
+        relErr = rel
+
+        print('\n*** Maximum errors for vector of size {}:  rtol={}, atol={}\n'.format(a.size, rtol, atol))
+        aTmp = a.copy()
+        bTmp = b.copy()
+        i = 1
+        while i <= a.size:
+            if i <= mismatches[0]:
+                print("%3d: Error %f  %s" %(i, rel, locationError(a, b, index, names)))
+
+            aTmp[index] = bTmp[index] = 0
+            if almost_equal(aTmp, bTmp, rtol, atol, equal_nan=equal_nan):
+                break
 
-        aTmp[index] = bTmp[index] = 0
-        if almost_equal(aTmp, bTmp, rtol, atol, equal_nan=equal_nan):
-            break
+            i += 1
+            if i <= mismatches[1] or mismatches[1] <= 0:
+                index, rel = _find_max_violation(aTmp, bTmp, rtol, atol)
+            else:
+                break
 
-        i += 1
-        if i <= mismatches[1] or mismatches[1] <= 0:
-            index, rel = find_max_violation(aTmp, bTmp, rtol, atol)
-        else:
-            break
+        mismatchDegree = "at least " if mismatches[1] > 0 and i > mismatches[1] else ""
+        errMsg = "Error %f exceeds tolerance rtol=%e, atol=%e (mismatch %s%f%%).\n%s" % \
+                 (relErr, rtol, atol, mismatchDegree, 100*i/a.size, \
+                  locationError(a, b, indexErr, names, maxError=True))
+    else:
+        errMsg = "Error %f exceeds tolerance rtol=%e, atol=%e.\n" % (rel, rtol, atol)
 
-    mismatchDegree = "at least " if mismatches[1] > 0 and i > mismatches[1] else ""
-    errMsg = "Error %f exceeds tolerance rtol=%e, atol=%e (mismatch %s%f%%).\n%s" % \
-             (relErr, rtol, atol, mismatchDegree, 100*i/a.size, \
-             locationError(a, b, indexErr, names, maxError=True))
     np.set_printoptions(threshold=4, suppress=True)
     msg = npt.build_err_msg([a, b], err_msg=errMsg)
 
@@ -662,16 +760,25 @@ def assert_almost_equal_with_err(a, b, rtol=None, atol=None, etol=None,
     ----------
     a : np.ndarray
     b : np.ndarray
+    rtol : None or float or dict of dtype -> float
+        The relative threshold. Default threshold will be used if set to ``None``.
+    atol : None or float or dict of dtype -> float
+        The absolute threshold. Default threshold will be used if set to ``None``.
     threshold : None or float
         The checking threshold. Default threshold will be used if set to ``None``.
     etol : None or float
         The error rate threshold. If etol is float, return true if error_rate < etol even if
         any error is found.
+    names : tuple of names, optional
+        The names used in error message when an exception occurs
+    equal_nan : boolean, optional
+        The flag determining how to treat NAN values in comparison
+    mismatches : tuple of mismatches
+        Maximum number of mismatches to be printed (mismatches[0]) and determine (mismatches[1])
     """
     etol = get_etol(etol)
     if etol > 0:
-        rtol = get_rtol(rtol)
-        atol = get_atol(atol)
+        rtol, atol = get_tols(a, b, rtol, atol)
         if isinstance(a, mx.nd.NDArray):
             a = a.asnumpy()
         if isinstance(b, mx.nd.NDArray):
@@ -679,7 +786,7 @@ def assert_almost_equal_with_err(a, b, rtol=None, atol=None, etol=None,
         equals = np.isclose(a, b, rtol=rtol, atol=atol)
         err = 1 - np.count_nonzero(equals) / equals.size
         if err > etol:
-            index, rel = find_max_violation(a, b, rtol, atol)
+            index, rel = _find_max_violation(a, b, rtol, atol)
             indexErr = index
             relErr = rel
 
@@ -697,7 +804,7 @@ def assert_almost_equal_with_err(a, b, rtol=None, atol=None, etol=None,
 
                 i += 1
                 if i <= mismatches[1] or mismatches[1] <= 0:
-                    index, rel = find_max_violation(aTmp, bTmp, rtol, atol)
+                    index, rel = _find_max_violation(aTmp, bTmp, rtol, atol)
                 else:
                     break
 
@@ -712,31 +819,6 @@ def assert_almost_equal_with_err(a, b, rtol=None, atol=None, etol=None,
         assert_almost_equal(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan)
 
 
-def almost_equal_ignore_nan(a, b, rtol=None, atol=None):
-    """Test that two NumPy arrays are almost equal (ignoring NaN in either array).
-    Combines a relative and absolute measure of approximate eqality.
-    If either the relative or absolute check passes, the arrays are considered equal.
-    Including an absolute check resolves issues with the relative check where all
-    array values are close to zero.
-
-    Parameters
-    ----------
-    a : np.ndarray
-    b : np.ndarray
-    rtol : None or float
-        The relative threshold. Default threshold will be used if set to ``None``.
-    atol : None or float
-        The absolute threshold. Default threshold will be used if set to ``None``.
-    """
-    a = np.copy(a)
-    b = np.copy(b)
-    nan_mask = np.logical_or(np.isnan(a), np.isnan(b))
-    a[nan_mask] = 0
-    b[nan_mask] = 0
-
-    return almost_equal(a, b, rtol, atol)
-
-
 def assert_almost_equal_ignore_nan(a, b, rtol=None, atol=None, names=('a', 'b')):
     """Test that two NumPy arrays are almost equal (ignoring NaN in either array).
     Combines a relative and absolute measure of approximate eqality.
@@ -776,14 +858,14 @@ def decorate(f):
         """Decorate a test case."""
         def wrapper(*args, **kwargs):
             """Wrapper for tests function."""
-            for _ in range(n):
+            for i in range(n):
                 try:
                     f(*args, **kwargs)
+                    return
                 except AssertionError as e:
-                    err = e
-                    continue
-                return
-            raise err
+                    if i == n-1:
+                        raise e
+                    mx.nd.waitall()
         return wrapper
     return decorate
 
@@ -1015,7 +1097,7 @@ def as_stype(var, stype, dtype):
     return approx_grads
 
 
-def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rtol=1e-2,
+def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=None, rtol=None,
                            atol=None, grad_nodes=None, use_forward_train=True, ctx=None,
                            grad_stype_dict=None, dtype=default_dtype()):
     """Verify an operation by checking backward pass via finite difference method.
@@ -1060,9 +1142,6 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rto
     [1] https://github.com/Theano/Theano/blob/master/theano/gradient.py
     """
     assert dtype in (np.float16, np.float32, np.float64)
-    # cannot use finite differences with small eps without high precision
-    if dtype in (np.float32, np.float16):
-        assert numeric_eps >= 1e-5
     if ctx is None:
         ctx = default_context()
 
@@ -1136,12 +1215,18 @@ def random_projection(shape):
     assert len(executor.outputs) == 1
 
     executor.forward(is_train=True)
+
+    eps = get_tolerance(executor.outputs[0], numeric_eps, default_numeric_eps())
+    # cannot use finite differences with small eps without high precision
+    if dtype in (np.float32, np.float16):
+        assert eps >= 1e-5
+
     executor.backward()
-    symbolic_grads = {k:executor.grad_dict[k].asnumpy() for k in grad_nodes}
+    symbolic_grads = executor.grad_dict
 
     numeric_gradients = numeric_grad(
         executor, location_npy, aux_states_npy,
-        eps=numeric_eps, use_forward_train=use_forward_train, dtype=dtype)
+        eps=eps, use_forward_train=use_forward_train, dtype=dtype)
 
     for name in grad_nodes:
         fd_grad = numeric_gradients[name]
@@ -1151,6 +1236,8 @@ def random_projection(shape):
             assert_almost_equal(fd_grad, sym_grad, rtol, atol,
                                 ("NUMERICAL_%s"%name, "BACKWARD_%s"%name))
         elif grad_req[name] == 'add':
+            if isinstance(sym_grad, mx.nd.NDArray):
+                sym_grad = sym_grad.asnumpy()
             assert_almost_equal(fd_grad, sym_grad - orig_grad, rtol, atol,
                                 ("NUMERICAL_%s"%name, "BACKWARD_%s"%name))
         elif grad_req[name] == 'null':
@@ -1160,7 +1247,7 @@ def random_projection(shape):
             raise ValueError("Invalid grad_req %s for argument %s"%(grad_req[name], name))
 
 
-def check_symbolic_forward(sym, location, expected, rtol=1E-4, atol=None,
+def check_symbolic_forward(sym, location, expected, rtol=None, atol=None,
                            aux_states=None, ctx=None, equal_nan=False,
                            dtype=default_dtype()):
     """Compares a symbol's forward results with the expected ones.
@@ -1236,14 +1323,14 @@ def check_symbolic_forward(sym, location, expected, rtol=1E-4, atol=None,
 
     executor.forward(is_train=False)
 
-    outputs = [x.asnumpy() for x in executor.outputs]
+    outputs = executor.outputs
     for output_name, expect, output in zip(sym.list_outputs(), expected, outputs):
         assert_almost_equal(expect, output, rtol, atol,
                             ("EXPECTED_%s"%output_name, "FORWARD_%s"%output_name),
                             equal_nan=equal_nan)
     return executor.outputs
 
-def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=None,
+def check_symbolic_backward(sym, location, out_grads, expected, rtol=None, atol=None,
                             aux_states=None, grad_req='write', ctx=None, grad_stypes=None,
                             equal_nan=False, dtype=default_dtype()):
     """Compares a symbol's backward results with the expected ones.
@@ -1361,7 +1448,7 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=
 
     executor.backward(out_grads)
 
-    grads = {k: v.asnumpy() for k, v in args_grad_data.items()}
+    grads = args_grad_data
 
     for name in expected:
         if grad_req[name] == 'write':
@@ -1369,7 +1456,8 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=
                                 ("EXPECTED_%s"%name, "BACKWARD_%s"%name),
                                 equal_nan=equal_nan)
         elif grad_req[name] == 'add':
-            assert_almost_equal(expected[name], grads[name] - args_grad_npy[name],
+            grad = grads[name].asnumpy() if isinstance(grads[name], mx.nd.NDArray) else grads[name]
+            assert_almost_equal(expected[name], grad - args_grad_npy[name],
                                 rtol, atol, ("EXPECTED_%s"%name, "BACKWARD_%s"%name),
                                 equal_nan=equal_nan)
         elif grad_req[name] == 'null':
@@ -1454,16 +1542,8 @@ def check_speed(sym, location=None, ctx=None, N=20, grad_req=None, typ="whole",
         raise ValueError('typ can only be "whole" or "forward".')
 
 
-def get_tolerance(rtol, ctx):
-    if 'atol' in ctx:
-        return ctx['atol']
-    if 'atol_mult' in ctx:
-        return ctx['atol_mult'] * rtol
-    return rtol
-
-
 def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
-                      arg_params=None, aux_params=None, tol=None,
+                      arg_params=None, aux_params=None, rtol=None, atol=None,
                       raise_on_err=True, ground_truth=None, equal_nan=False,
                       use_uniform=False, rand_type=np.float64):
     """Check symbol gives the same output for different running context
@@ -1478,6 +1558,20 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
         Standard deviation of the inner normal distribution. Used in initialization.
     grad_req : str or list of str or dict of str to str
         Gradient requirement.
+    arg_params : dict of input name -> input data
+        data to use for non-aux inputs
+    aux_params : dict of input name -> input data
+        data to use for aux inputs
+    rtol : float or dictionary dtype->float, optional
+        The relative error tolerance.
+    atol : float or dictionary dtype->float, optional
+        The absolute error tolerance.
+    raise_on_err : bool, optional, defaults to True
+        Should an error raise an exception (or just output exception message)
+    ground_truth : dict of output name -> data, optional
+        Provided ideal result to be compared against
+    equal_nan : bool, optional, defaults to False
+        Should nans be treated as equal in the comparison
     use_unifrom: bool
         Optional, When flag set to true,
         random input data generated follows uniform distribution,
@@ -1513,20 +1607,6 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
   'type_dict': {'concat_arg0': np.float32, 'concat_arg1': np.float32}}]
     >>> check_consistency(sym, ctx_list)
     """
-    if tol is None:
-        tol = {np.dtype(np.float16): 1e-1,
-               np.dtype(np.float32): 1e-3,
-               np.dtype(np.float64): 1e-5,
-               np.dtype(np.uint8): 0,
-               np.dtype(np.int32): 0,
-               np.dtype(np.int64): 0}
-    elif isinstance(tol, numbers.Number):
-        tol = {np.dtype(np.float16): tol,
-               np.dtype(np.float32): tol,
-               np.dtype(np.float64): tol,
-               np.dtype(np.uint8): tol,
-               np.dtype(np.int32): tol,
-               np.dtype(np.int64): tol}
 
     assert len(ctx_list) > 1
     if isinstance(sym, Symbol):
@@ -1544,10 +1624,16 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
 
     arg_params = {} if arg_params is None else arg_params
     aux_params = {} if aux_params is None else aux_params
-    for n, arr in exe_list[0].arg_dict.items():
+
+    # Returns whichever dtype has the smaller itemsize (dt2 on ties); returns dt1 if dt2 is None.
+    def smaller_dtype(dt1, dt2):
+        return dt1 if dt2 is None or np.dtype(dt1).itemsize < np.dtype(dt2).itemsize else dt2
+
+    # It's important to assign random inputs in a deterministic order, for reproducibility.
+    for n, arr in _sorted_items(exe_list[0].arg_dict):
         if n not in arg_params:
             if use_uniform:
-                arg_params[n] = np.random.uniform(low=-0.92, high=0.92,
+                arg_params[n] = np.random.uniform(low=-0.92 * scale, high=0.92 * scale,
                                                   size=arr.shape).astype(rand_type)
             else:
                 arg_params[n] = np.random.normal(size=arr.shape,
@@ -1566,31 +1652,28 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
                 arr[:] = np.zeros(arr.shape, dtype=arr.dtype)
 
     dtypes = [np.dtype(exe.outputs[0].dtype) for exe in exe_list]
-    max_idx = np.argmax(dtypes)
+    # Select the ground truth as the first model having the highest precision output[0]
+    gt_idx = np.argmax(dtypes)
     gt = ground_truth
     if gt is None:
-        gt = exe_list[max_idx].output_dict.copy()
+        gt = exe_list[gt_idx].output_dict.copy()
         if grad_req != 'null':
-            gt.update(exe_list[max_idx].grad_dict)
+            gt.update(exe_list[gt_idx].grad_dict)
 
     # test
     for exe in exe_list:
         exe.forward(is_train=False)
 
     for i, exe in enumerate(exe_list):
-        if i == max_idx:
+        if i == gt_idx:
             continue
 
-        rtol = tol[dtypes[i]]
-        atol = get_tolerance(rtol, ctx_list[i])
         for name, arr in zip(output_names, exe.outputs):
-            # Previously, the cast was to dtypes[i], but symbol may be mixed-precision,
-            # so casting the ground truth to the actual output type seems more correct.
-            gtarr = gt[name].astype(arr.dtype)
+            gtarr = gt[name]
             try:
                 assert_almost_equal(arr, gtarr, rtol=rtol, atol=atol, equal_nan=equal_nan)
             except AssertionError as e:
-                print('Predict Err: ctx %d vs ctx %d at %s'%(i, max_idx, name))
+                print('Predict Err: ctx %d vs ctx %d at %s'%(i, gt_idx, name))
                 traceback.print_exc()
                 if raise_on_err:
                     raise e
@@ -1599,29 +1682,50 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
 
     # train
     if grad_req != 'null':
+        # Perform forward()
         for exe in exe_list:
             exe.forward(is_train=True)
-            exe.backward(exe.outputs)
+        # Use the first executor's output data, cast to the least precise dtype,
+        # as the gradient data to pass to all executor's backward() call.
+        least_precise_dtype = [out.dtype for out in exe_list[0].outputs]
+        for exe in exe_list:
+            least_precise_dtype = [smaller_dtype(out1.dtype, dt) \
+                                    for (out1, dt) in zip(exe.outputs, least_precise_dtype)]
+        golden_data_np = [out.astype(dt).asnumpy() \
+                          for (out, dt) in zip(exe_list[0].outputs, least_precise_dtype)]
+        # Perform backward()
+        for exe in exe_list:
+            out_grads = [mx.nd.array(golden_np, ctx=exe._ctx,
+                                     dtype=out.dtype).tostype(out.stype)
+                         for (golden_np, out) in zip(golden_data_np, exe.outputs)]
+            exe.backward(out_grads)
 
         for i, exe in enumerate(exe_list):
-            if i == max_idx:
+            if i == gt_idx:
                 continue
 
-            rtol = tol[dtypes[i]]
-            atol = get_tolerance(rtol, ctx_list[i])
             curr = zip(output_names + arg_names, exe.outputs + exe.grad_arrays)
             for name, arr in curr:
                 if gt[name] is None:
                     assert arr is None
                     continue
 
-                # Previous cast was to dtypes[i], but symbol may be mixed-precision,
-                # so casting the ground truth to the actual output type seems more correct.
-                gtarr = gt[name].astype(arr.dtype)
+                gtarr = gt[name]
                 try:
-                    assert_almost_equal(arr, gtarr, rtol=rtol, atol=atol, equal_nan=equal_nan)
+                    rt, at = rtol, atol
+                    # If the primary data i/o type is float16, then the tolerance used when
+                    # comparing a float32 input gradient (e.g. batchnorm gamma) should be float16.
+                    smaller_arr_dtype = smaller_dtype(arr.dtype, dtypes[i])
+                    smaller_gt_dtype = smaller_dtype(gtarr.dtype, dtypes[gt_idx])
+                    if smaller_arr_dtype != arr.dtype or \
+                       smaller_gt_dtype != gtarr.dtype:
+                        rt, at = get_tols(arr.astype(smaller_arr_dtype),
+                                          gtarr.astype(smaller_gt_dtype), rtol, atol)
+                    assert_almost_equal(arr, gtarr, rtol=rt, atol=at, equal_nan=equal_nan)
                 except AssertionError as e:
-                    print('Train Err: ctx %d vs ctx %d at %s'%(i, max_idx, name))
+                    print('Train Err: {} {} ctx {} vs {} {} ctx {} at {}'.format(
+                        np.dtype(arr.dtype).name, arr.ctx, i,
+                        np.dtype(gtarr.dtype).name, gtarr.ctx, gt_idx, name))
                     traceback.print_exc()
                     if raise_on_err:
                         raise e
diff --git a/src/operator/linalg.h b/src/operator/linalg.h
index 291e251f5cbc..3e82c6a2fad1 100644
--- a/src/operator/linalg.h
+++ b/src/operator/linalg.h
@@ -280,6 +280,14 @@ void linalg_batch_det_backward_helper(const Tensor<xpu, 3, DType>& LU,
                                       const DType zero_det,
                                       const mxnet::OpContext& ctx);
 
+#ifdef __CUDACC__
+#if CUDA_VERSION < 11000
+#define VERSION_ADJUSTED_TF32_MATH CUBLAS_DEFAULT_MATH  // pre-11.0 CUDA: fall back to default math
+#else  // CUDA_VERSION >= 11000
+#define VERSION_ADJUSTED_TF32_MATH CUBLAS_TF32_TENSOR_OP_MATH  // request TF32 tensor-op math
+#endif  // CUDA_VERSION < 11000
+#endif  // __CUDACC__
+
 #include "linalg_impl.h"
 
 #endif  // MXNET_OPERATOR_LINALG_H_
diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h
index fd6800d184e4..47b54f6ac340 100644
--- a/src/operator/linalg_impl.h
+++ b/src/operator/linalg_impl.h
@@ -205,12 +205,15 @@ inline void linalg_gemm<gpu, float>(const Tensor<gpu, 2, float>& A,
 #else
   cublasDataType_t full_datatype = CUBLAS_DATA_FULL;
 #endif
+  auto handle = Stream<gpu>::GetBlasHandle(s);
+  cublasMath_t saved_math_mode = SetCublasMathMode(handle, VERSION_ADJUSTED_TF32_MATH);
   CUBLAS_CALL(cublasSgemmEx(
-      Stream<gpu>::GetBlasHandle(s), (tB ? CUBLAS_OP_T : CUBLAS_OP_N),
+      handle, (tB ? CUBLAS_OP_T : CUBLAS_OP_N),
       (tA ? CUBLAS_OP_T : CUBLAS_OP_N), C.size(1), C.size(0),
       (tB ? B.size(1) : B.size(0)), &alpha, B.dptr_, full_datatype, B.stride_,
       A.dptr_, full_datatype, A.stride_, &beta, C.dptr_, full_datatype,
-      C.stride_))
+      C.stride_));
+  CUBLAS_CALL(cublasSetMathMode(handle, saved_math_mode));
 }
 
 #else
@@ -228,13 +231,16 @@ void linalg_gemm_axis<gpu, DType>(const Tensor<gpu, 3, DType>& A, const Tensor<g
   using mshadow::gpu; \
   CHECK_NOTNULL(s); \
   linalg_check_batch_size(A.size(1), B.size(1), C.size(1)); \
-  CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
+  auto handle = Stream<gpu>::GetBlasHandle(s); \
+  cublasMath_t saved_math_mode = SetCublasMathMode(handle, VERSION_ADJUSTED_TF32_MATH); \
+  CUBLAS_CALL(cublas##fname(handle, \
                             (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \
                             (tA ? CUBLAS_OP_T : CUBLAS_OP_N), \
                             C.size(2), C.size(0), (tB ? B.size(2) : B.size(0)), &alpha, \
                             B.dptr_, B.size(1)*B.stride_, B.stride_, \
                             A.dptr_, A.size(1)*A.stride_, A.stride_, &beta, \
                             C.dptr_, C.size(1)*C.stride_, C.stride_, A.size(1))) \
+  CUBLAS_CALL(cublasSetMathMode(handle, saved_math_mode)); \
 }
 LINALG_GPU_GEMM_AXIS(SgemmStridedBatched, float)
 LINALG_GPU_GEMM_AXIS(DgemmStridedBatched, double)
@@ -342,13 +348,22 @@ void linalg_gemm<gpu, mshadow::half::half_t>(const Tensor<gpu, 2, mshadow::half:
     linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
     check_gemm(A[0], B[0], C[0], alpha, beta, tA, tB); \
     using namespace mshadow::cuda; \
-    CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
+    auto handle = Stream<gpu>::GetBlasHandle(s); \
+    cublasMath_t saved_math_mode = SetCublasMathMode(handle, VERSION_ADJUSTED_TF32_MATH); \
+    CUBLAS_CALL(cublas##fname(handle, \
                               (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \
                               (tA ? CUBLAS_OP_T : CUBLAS_OP_N), \
                               C.size(2), C.size(1), (tB ? B.size(2) : B.size(1)), \
-                              &alpha, B.dptr_, B.stride_, B.size(1) * B.stride_, \
-                              A.dptr_,  A.stride_, A.size(1) * A.stride_, \
-                              &beta, C.dptr_, C.stride_, C.size(1) * C.stride_, A.size(0))) \
+                              &alpha, \
+                              B.dptr_, B.stride_, \
+                              static_cast<int64_t>(B.size(1) * B.stride_), \
+                              A.dptr_,  A.stride_, \
+                              static_cast<int64_t>(A.size(1) * A.stride_), \
+                              &beta, \
+                              C.dptr_, C.stride_, \
+                              static_cast<int64_t>(C.size(1) * C.stride_), \
+                              A.size(0))) \
+    CUBLAS_CALL(cublasSetMathMode(handle, saved_math_mode)); \
   }
 
   LINALG_GPU_BATCH_GEMM(DgemmStridedBatched, double)
@@ -373,7 +388,7 @@ void linalg_gemm<gpu, mshadow::half::half_t>(const Tensor<gpu, 2, mshadow::half:
 
       using namespace mshadow::cuda;
       auto cublas_math_mode =
-          use_tensor_ops ? CUBLAS_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH;
+          use_tensor_ops ? CUBLAS_TENSOR_OP_MATH : VERSION_ADJUSTED_TF32_MATH;
       auto previous_math_mode = SetCublasMathMode(blas_handle, cublas_math_mode);
 
       // cublasGemmStridedBatchedEx is only supported for GPU with architecture
@@ -414,6 +429,8 @@ void linalg_gemm<gpu, mshadow::half::half_t>(const Tensor<gpu, 2, mshadow::half:
     CHECK_NOTNULL(s); \
     linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
     linalg_check_batch_size(A.size(2), B.size(2), C.size(2)); \
+    auto handle = Stream<gpu>::GetBlasHandle(s);                                           \
+    cublasMath_t saved_math_mode = SetCublasMathMode(handle, VERSION_ADJUSTED_TF32_MATH); \
     for (index_t i = 0; i < A.size(2); ++i) { \
       CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
           (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \
@@ -423,6 +440,7 @@ void linalg_gemm<gpu, mshadow::half::half_t>(const Tensor<gpu, 2, mshadow::half:
           A.dptr_+i*A.stride_, A.size(2) * A.stride_, A.size(1)*A.size(2)*A.stride_, &beta, \
           C.dptr_+i*C.stride_, C.size(2) * C.stride_, C.size(1)*C.size(2)*C.stride_, A.size(0))) \
     }\
+    SetCublasMathMode(handle, saved_math_mode); \
   }
 
   LINALG_GPU_BATCH_GEMM_AXIS(SgemmStridedBatched, float)
diff --git a/src/operator/numpy/np_true_divide-inl.h b/src/operator/numpy/np_true_divide-inl.h
index 0bc60a08803e..e7a1c193d97f 100644
--- a/src/operator/numpy/np_true_divide-inl.h
+++ b/src/operator/numpy/np_true_divide-inl.h
@@ -58,14 +58,17 @@ void TrueDivideScalarCompute(const nnvm::NodeAttrs &attrs,
     });
   } else {
 #ifndef _WIN32
-    CHECK_EQ(outputs[0].type_flag_, kFloat32) << "true_divide only supports float32 output "
-                                                 "when input's dtype is "
-                                              << type_string(inputs[0].type_flag_);
-    MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-        Kernel<op_with_req<OP, Req>, xpu>::Launch(
-          s, data.Size(), out.dptr<float>(), data.dptr<DType>(),
-          static_cast<float>(alpha));
+    CHECK(out.type_flag_ == mshadow::kFloat32 || out.type_flag_ == mshadow::kFloat64)
+      << "true_divide only supports float32 and float64"
+         " output when input's dtype is "
+      << type_string(inputs[0].type_flag_);
+    MSHADOW_REAL_TYPE_SWITCH(out.type_flag_, ODType, {
+      MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+        MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+          Kernel<op_with_req<OP, Req>, xpu>::Launch(
+            s, data.Size(), out.dptr<ODType>(), data.dptr<DType>(),
+            static_cast<ODType>(alpha));
+        });
       });
     });
 #else
diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index 42a2424c7d9b..52280bf898a5 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -50,10 +50,9 @@ def check_rnn_layer(layer):
         states = layer.begin_state(16)
         co, cs = layer(x, states)
 
-    # atol of 1e-6 required, as exposed by seed 2124685726
-    assert_almost_equal(go, co, rtol=1e-2, atol=1e-6)
+    assert_almost_equal(go, co)
     for g, c in zip(gs, cs):
-        assert_almost_equal(g, c, rtol=1e-2, atol=1e-6)
+        assert_almost_equal(g, c)
 
 
 @with_seed()
@@ -70,9 +69,9 @@ def check_rnn_layer_w_rand_inputs(layer):
         states = layer.begin_state(16)
         co, cs = layer(x, states)
 
-    assert_almost_equal(go, co, rtol=1e-2, atol=1e-6)
+    assert_almost_equal(go, co)
     for g, c in zip(gs, cs):
-        assert_almost_equal(g, c, rtol=1e-2, atol=1e-6)
+        assert_almost_equal(g, c)
 
 
 @with_seed()
@@ -481,6 +480,13 @@ def tensor_size(big_tensor_bytes):
     # This in the past has given cudnnFind() trouble when it needed to allocate similar I/O's
     # from the area carved out by the MXNET_GPU_MEM_POOL_RESERVE setting (by default 5%).
     (free_mem_bytes, total_mem_bytes) = mx.context.gpu_memory_info(ctx.device_id)
+    # This test needs to be 'qualified' for use with each new larger memory size
+    largest_supported_total_mem_GB = 32
+    if (total_mem_bytes > largest_supported_total_mem_GB * 1024 * 1024 * 1024):
+        sys.stderr.write(
+        ' bypassing test due to too-large global memory of size {} ... '.format(total_mem_bytes))
+        return
+
     start_size = tensor_size(0.20 * total_mem_bytes)
     num_trials = 10
     sys.stderr.write(
diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py
index 6f559db62808..8d473f705a41 100644
--- a/tests/python/gpu/test_gluon_model_zoo_gpu.py
+++ b/tests/python/gpu/test_gluon_model_zoo_gpu.py
@@ -91,7 +91,7 @@ def test_inference():
             max_val = np.max(np.abs(cpu_out.asnumpy()))
             gpu_max_val = np.max(np.abs(gpu_out.asnumpy()))
             eprint(model_name + ": CPU " + str(max_val) + ", GPU " + str(gpu_max_val))
-            assert_almost_equal(cpu_out / max_val, gpu_out / gpu_max_val, rtol=1e-3, atol=1e-3)
+            assert_almost_equal(cpu_out / max_val, gpu_out / gpu_max_val)
 
 def get_nn_model(name):
     if "densenet" in name:
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 9378480d9af6..bcf906a92e44 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -26,6 +26,7 @@
 from nose.tools import assert_raises
 import scipy.sparse as sps
 import mxnet.ndarray.sparse as mxsps
+import itertools
 from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, assert_allclose
 from mxnet.base import MXNetError
 from mxnet import autograd
@@ -495,30 +496,20 @@ def test_batchnorm_with_type():
 
 
   # V2, 2D
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=False, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_2D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=False, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_2D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=True, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_2D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=True, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_2D)
+  bools = [False, True]
+  for fix_gamma, cudnn_off in itertools.product(bools, bools):
+      sym = mx.sym.BatchNorm(name='norm', fix_gamma=fix_gamma, cudnn_off=cudnn_off)
+      check_consistency(sym, ctx_list_v2_2D)
 
   # V2, 1D
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=False, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_1D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=False, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_1D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=True, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_1D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=True, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_1D)
-  #
-  # # V2, 3D
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=False, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_3D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=True, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_3D)
+  for fix_gamma, cudnn_off in itertools.product(bools, bools):
+      sym = mx.sym.BatchNorm(name='norm', fix_gamma=fix_gamma, cudnn_off=cudnn_off)
+      check_consistency(sym, ctx_list_v2_1D)
+
+  # V2, 3D
+  for fix_gamma, cudnn_off in itertools.product(bools, [True,]):
+      sym = mx.sym.BatchNorm(name='norm', fix_gamma=fix_gamma, cudnn_off=cudnn_off)
+      check_consistency(sym, ctx_list_v2_3D)
 
 
 @with_seed()
@@ -632,9 +623,9 @@ def test_convolution_with_type():
                np.dtype(np.float64): 1e-5,
                np.dtype(np.uint8): 0,
                np.dtype(np.int32): 0}
-    check_consistency(sym, ctx_list, tol=tol)
+    check_consistency(sym, ctx_list, rtol=tol, atol=tol)
     # test ability to turn off training on bias
-    check_consistency(sym, ctx_list, grad_req={'conv_data': 'write', 'conv_weight': 'write', 'conv_bias': 'null'}, tol=tol)
+    check_consistency(sym, ctx_list, grad_req={'conv_data': 'write', 'conv_weight': 'write', 'conv_bias': 'null'}, rtol=tol, atol=tol)
 
 
 # Apply N symbols against each of M contexts, checking that all NxM combinations match.
@@ -717,7 +708,6 @@ def test_conv_deconv_guards():
     # Test cases for convolution and deconvolution via strided fft.  Ensure that the framework
     # guards against problematic CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING in cuDNN [7.3.1,7.5)
     # see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_750.html#rel_750
-    tol = 1e-1
     for (op, opname) in [(mx.sym.Convolution, 'conv'), (mx.sym.Deconvolution, 'deconv')]:
         dataname = opname + '_data'
         ctx = {'ctx': mx.gpu(0), dataname: (32, 32, 64, 64), 'type_dict': {dataname: np.float32}}
@@ -732,7 +722,7 @@ def test_conv_deconv_guards():
             try:
                 sym = op(**test_case_args)
                 sym_no_cudnn = op(cudnn_off=True, **test_case_args)
-                check_consistency([sym, sym_no_cudnn], [ctx, ctx], tol=tol)
+                check_consistency([sym, sym_no_cudnn], [ctx, ctx], scale=0.1)
             except:
                 print('Test failure of mx.sym.{} with args: {}'.format(op.__name__, test_case_args))
                 raise
@@ -756,7 +746,7 @@ def _conv_with_num_streams(seed):
                                               cudnn_off=True, name='conv')
             try:
                 # tol can be pretty high- we're looking for a large diff due to garbaged workspace
-                check_consistency([sym, sym_no_cudnn], [ctx, ctx], tol=1e-2)
+                check_consistency([sym, sym_no_cudnn], [ctx, ctx], rtol=1e-2, atol=1e-2)
             except:
                 print('Failing conv size = {}'.format(size))
                 raise
@@ -778,20 +768,19 @@ def test_convolution_multiple_streams():
 @with_seed()
 def test_convolution_large_c():
     problematic_c = 64 * 1024
-    # The convolution accumulates many values, so set large tolerances.
-    tol = {np.dtype(np.float32): 1,
-           np.dtype(np.float64): 1}
+    # The convolution accumulates many values, so scale the input magnitude.
+    scale = 0.1
     def test_1D_with_width(width, grad_req):
         ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, width), 'type_dict': {'conv_data': np.float32}},
                     {'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, width), 'type_dict': {'conv_data': np.float64}}]
         sym = mx.sym.Convolution(layout='NCW', num_filter=8, kernel=(2,), name='conv')
-        check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req)
+        check_consistency([sym, sym], ctx_list, grad_req=grad_req, scale=scale)
 
     def test_2D_with_width(width, grad_req):
         ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, 2, width), 'type_dict': {'conv_data': np.float32}},
                     {'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, 2, width), 'type_dict': {'conv_data': np.float64}}]
         sym = mx.sym.Convolution(layout='NCHW', num_filter=4, kernel=(2,2), name='conv')
-        check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req)
+        check_consistency([sym, sym], ctx_list, grad_req=grad_req, scale=scale)
 
     # Run with different data tensor shapes to run cudnnFind() multiple times.
     # First, populate algo and op caches with models that always use cudnnFind() (req == 'write').
@@ -808,20 +797,19 @@ def test_2D_with_width(width, grad_req):
 @with_seed()
 def test_deconvolution_large_c():
     problematic_c = 64 * 1024
-    # The deconvolution accumulates many values, so set large tolerances.
-    tol = {np.dtype(np.float32): 1,
-           np.dtype(np.float64): 1}
+    # The deconvolution accumulates many values, so scale the input magnitude.
+    scale = 0.1
     def test_1D_with_width(width, grad_req):
         ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (1, 8, width), 'type_dict': {'deconv_data': np.float32}},
                     {'ctx': mx.gpu(0), 'deconv_data': (1, 8, width), 'type_dict': {'deconv_data': np.float64}}]
         sym = mx.sym.Deconvolution(layout='NCW', num_filter=problematic_c, kernel=(2,), name='deconv')
-        check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req)
+        check_consistency([sym, sym], ctx_list, grad_req=grad_req, scale=scale)
 
     def test_2D_with_width(width, grad_req):
         ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (1, 8, 2, width), 'type_dict': {'deconv_data': np.float32}},
                     {'ctx': mx.gpu(0), 'deconv_data': (1, 8, 2, width), 'type_dict': {'deconv_data': np.float64}}]
         sym = mx.sym.Deconvolution(layout='NCHW', num_filter=problematic_c, kernel=(2,2), name='deconv')
-        check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req)
+        check_consistency([sym, sym], ctx_list, grad_req=grad_req, scale=scale)
 
     # Run with different data tensor shapes to run cudnnFind() multiple times.
     # First, populate algo and op caches with models that always use cudnnFind() (req == 'write').
@@ -926,8 +914,8 @@ def test_deconvolution_with_type():
                np.dtype(np.float64): 1e-5,
                np.dtype(np.uint8): 0,
                np.dtype(np.int32): 0}
-    check_consistency(sym, ctx_list, tol=tol)
-    check_consistency(sym, ctx_list, tol=tol, grad_req="add")
+    check_consistency(sym, ctx_list, rtol=tol, atol=tol)
+    check_consistency(sym, ctx_list, rtol=tol, atol=tol, grad_req="add")
 
     # 2D deconvolution
     sym = mx.sym.Deconvolution(num_filter=2, kernel=(3,3), name='deconv')
@@ -942,8 +930,8 @@ def test_deconvolution_with_type():
                np.dtype(np.float64): 1e-5,
                np.dtype(np.uint8): 0,
                np.dtype(np.int32): 0}
-    check_consistency(sym, ctx_list, tol=tol)
-    check_consistency(sym, ctx_list, tol=tol, grad_req="add")
+    check_consistency(sym, ctx_list, rtol=tol, atol=tol)
+    check_consistency(sym, ctx_list, rtol=tol, atol=tol, grad_req="add")
 
 
 @with_seed()
@@ -1025,10 +1013,11 @@ def test_bilinear_sampler_with_type():
 def test_grid_generator_with_type():
     data = mx.sym.Variable('data')
     sym = mx.sym.GridGenerator(data=data, transform_type='affine', target_shape=(20, 20))
+    scale = 1
     ctx_list = [{'ctx': mx.gpu(0), 'data': (3, 6), 'type_dict': {'data': np.float32}},
                 {'ctx': mx.cpu(0), 'data': (3, 6), 'type_dict': {'data': np.float32}}]
-    check_consistency(sym, ctx_list)
-    check_consistency(sym, ctx_list, grad_req="add")
+    check_consistency(sym, ctx_list, scale=scale)
+    check_consistency(sym, ctx_list, scale=scale, grad_req="add")
     sym = mx.sym.GridGenerator(data=data, transform_type='warp', target_shape=(20, 20))
     ctx_list = [{'ctx': mx.gpu(0), 'data': (3, 2, 20, 20), 'type_dict': {'data': np.float32}},
                 {'ctx': mx.cpu(0), 'data': (3, 2, 20, 20), 'type_dict': {'data': np.float32}}]
@@ -1173,7 +1162,7 @@ def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, str
                                                                             pool_op))
             sym_list.append(sym)
 
-        check_consistency(sym_list, ctx_list, equal_nan=(not count_include_pad), tol=tol)
+        check_consistency(sym_list, ctx_list, equal_nan=(not count_include_pad), rtol=tol, atol=tol)
 
     def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_include_pad=True,
                          tol=None):
@@ -1331,7 +1320,7 @@ def test_flatten_slice_after_conv():
 
     ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (2, 16, 16, 16), 'type_dict': {'conv_data': np.float32}},
                 {'ctx': mx.cpu(0), 'conv_data': (2, 16, 16, 16), 'type_dict': {'conv_data': np.float32}}]
-    check_consistency(slice_sym, ctx_list)
+    check_consistency(slice_sym, ctx_list, scale=0.5)
 
 
 @with_seed()
@@ -1634,7 +1623,7 @@ def test_embedding_helper(data_types, weight_types, low_pad, high_pad):
                         'type_dict': {'embedding_data': data_type, 'embedding_weight': weight_type}})
             arg_params = {'embedding_data': np.random.randint(low=-low_pad, high=V+high_pad, size=(N,))}
             check_consistency(sym, ctx_list, grad_req={'embedding_data': 'null','embedding_weight': 'write'},
-                              arg_params=arg_params)
+                              arg_params=arg_params, scale=0.1)
 
     data_types = [np.float16, np.float32, np.float64, np.int32]
     weight_types = [np.float16, np.float32, np.float64]
@@ -1882,7 +1871,7 @@ def test_deformable_psroipooling_with_type():
                                'deformable_psroipool_trans': np.float16}},
                 ]
 
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol,
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol,
                       grad_req={'deformable_psroipool_data': 'write',
                                 'deformable_psroipool_rois': 'null',
                                 'deformable_psroipool_trans': 'write'}, arg_params=arg_params)
@@ -1913,9 +1902,9 @@ def test_deformable_convolution_with_type():
                  'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}},
                 ]
 
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol)
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol)
     # test ability to turn off training on bias
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol,
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol,
                       grad_req={'deformable_conv_data': 'write',
                                 'deformable_conv_offset': 'write',
                                 'deformable_conv_weight': 'write',
@@ -1948,7 +1937,7 @@ def test_deformable_convolution_options():
                  'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}},
                 ]
     sym = mx.sym.contrib.DeformableConvolution(num_filter=3, kernel=(3,3), pad=(1,1), name='deformable_conv')
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol)
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol)
 
     # Stride > 1
     ctx_list = [{'ctx': mx.gpu(0),
@@ -1969,7 +1958,7 @@ def test_deformable_convolution_options():
                  'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}},
                 ]
     sym = mx.sym.contrib.DeformableConvolution(num_filter=3, kernel=(3,3), stride=(2,2), name='deformable_conv')
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol)
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol)
 
     # Dilate > 1
     ctx_list = [{'ctx': mx.gpu(0),
@@ -1990,7 +1979,7 @@ def test_deformable_convolution_options():
                  'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}},
                 ]
     sym = mx.sym.contrib.DeformableConvolution(num_filter=3, kernel=(3,3), dilate=(2,2), name='deformable_conv')
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol)
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol)
 
     # Deformable group > 1
     ctx_list = [{'ctx': mx.gpu(0),
@@ -2011,7 +2000,7 @@ def test_deformable_convolution_options():
                  'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}},
                 ]
     sym = mx.sym.contrib.DeformableConvolution(num_filter=4, kernel=(3,3), num_deformable_group=2, name='deformable_conv')
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol)
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol)
 
 
 @with_seed()
diff --git a/tests/python/unittest/common.py b/tests/python/unittest/common.py
index ab2d191f1360..8e4e2e35f0cc 100644
--- a/tests/python/unittest/common.py
+++ b/tests/python/unittest/common.py
@@ -214,11 +214,13 @@ def test_new(*args, **kwargs):
                 try:
                     orig_test(*args, **kwargs)
                 except:
-                    # With exceptions, repeat test_msg at INFO level to be sure it's seen.
-                    if log_level < logging.INFO:
-                        logger.info(test_msg)
+                    # With exceptions, repeat test_msg at WARNING level to be sure it's seen.
+                    if log_level < logging.WARNING:
+                        logger.warning(test_msg)
                     raise
                 finally:
+                    # Provide test-isolation for any test having this decorator
+                    mx.nd.waitall()
                     np.random.set_state(post_test_state)
         return test_new
     return test_helper
@@ -277,7 +279,7 @@ def setup_module():
         seed = np.random.randint(0, np.iinfo(np.int32).max)
     else:
         seed = int(module_seed_str)
-        logger.warn('*** module-level seed is set: all tests running deterministically ***')
+        logger.warning('*** module-level seed is set: all tests running deterministically ***')
     logger.info('Setting module np/mx/python random seeds, use MXNET_MODULE_SEED=%s to reproduce.', seed)
     np.random.seed(seed)
     mx.random.seed(seed)
@@ -285,7 +287,7 @@ def setup_module():
     # The MXNET_TEST_SEED environment variable will override MXNET_MODULE_SEED for tests with
     #  the 'with_seed()' decoration.  Inform the user of this once here at the module level.
     if os.getenv('MXNET_TEST_SEED') is not None:
-        logger.warn('*** test-level seed set: all "@with_seed()" tests run deterministically ***')
+        logger.warning('*** test-level seed set: all "@with_seed()" tests run deterministically ***')
 
 try:
     from tempfile import TemporaryDirectory
@@ -373,4 +375,4 @@ def run_in_spawned_process(func, env, *args):
         finally:
             os.environ.clear()
             os.environ.update(orig_environ)
-    return True
\ No newline at end of file
+    return True
diff --git a/tests/python/unittest/test_autograd.py b/tests/python/unittest/test_autograd.py
index 61955f034a71..a72af688764e 100644
--- a/tests/python/unittest/test_autograd.py
+++ b/tests/python/unittest/test_autograd.py
@@ -433,6 +433,7 @@ def check_grad_with_stype(array_stype, grad_stype, expected_stype):
             check_grad_with_stype(stype, grad_stype, grad_stype)
 
 @with_seed()
+@pytest.mark.garbage_expected
 def test_sparse_dot_grad():
     def check_sparse_dot_grad(rhs):
         lhs = rand_ndarray((2, 8), 'csr')
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index 60fd526e16c7..6129c2892d05 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -23,7 +23,7 @@
 from mxnet import gluon
 from mxnet.gluon import nn
 from mxnet.base import py_str, MXNetError
-from mxnet.test_utils import assert_almost_equal
+from mxnet.test_utils import assert_almost_equal, default_context
 from mxnet.util import is_np_array
 from mxnet.ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID
 from mxnet.test_utils import use_np
@@ -897,7 +897,7 @@ def _syncParameters(bn1, bn2, ctx):
                             input2grad.asnumpy(), atol=atol, rtol=rtol)
 
     cfgs = [(1, False)]
-    num_gpus = mx.context.num_gpus()
+    num_gpus = 0 if default_context().device_type != 'gpu' else mx.context.num_gpus()
     batch_size = 24
     for i in range(1, num_gpus + 1):
         if batch_size % i == 0:
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index 3a9bd9e93126..167d26e922e4 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -24,7 +24,7 @@
 import random
 import functools
 from nose.tools import assert_raises, raises
-from common import with_seed, assertRaises, TemporaryDirectory
+from common import with_seed, assertRaises, TemporaryDirectory, setup_module, teardown
 from mxnet.test_utils import almost_equal
 from mxnet.test_utils import assert_almost_equal, assert_exception
 from mxnet.test_utils import default_context
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index 18b26579f740..67722047ddee 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -28,7 +28,7 @@
 from mxnet.test_utils import assert_almost_equal
 from mxnet.test_utils import use_np
 from mxnet.test_utils import is_op_runnable
-from common import assertRaises, with_seed
+from common import assertRaises, with_seed, random_seed
 from mxnet.numpy_dispatch_protocol import with_array_function_protocol, with_array_ufunc_protocol
 from mxnet.numpy_dispatch_protocol import _NUMPY_ARRAY_FUNCTION_LIST, _NUMPY_ARRAY_UFUNC_LIST
 
@@ -471,8 +471,8 @@ def _add_workload_linalg_cholesky():
     dtypes = (np.float32, np.float64)
 
     for shape, dtype in itertools.product(shapes, dtypes):
-        _np.random.seed(1)
-        a = _np.random.randn(*shape)
+        with random_seed(1):
+            a = _np.random.randn(*shape)
 
         t = list(range(len(shape)))
         t[-2:] = -1, -2
@@ -2705,7 +2705,6 @@ def _add_workload_unwrap():
     phase[3:] += np.pi
     phase_s = np.vstack((phase,phase))
     OpArgMngr.add_workload('unwrap', phase)
-    print(phase_s.shape)
     OpArgMngr.add_workload('unwrap', phase_s, axis=1)
 
 
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index 475ff0243290..c5804c253744 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -31,7 +31,7 @@
 from mxnet.gluon import HybridBlock
 from mxnet.base import MXNetError
 from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray
-from mxnet.test_utils import check_numeric_gradient, use_np, collapse_sum_like
+from mxnet.test_utils import check_numeric_gradient, use_np, collapse_sum_like, effective_dtype
 from mxnet.test_utils import new_matrix_with_real_eigvals_nd
 from mxnet.test_utils import new_sym_matrix_with_real_eigvals_nd
 from common import assertRaises, with_seed
@@ -1488,15 +1488,18 @@ def _test_batchnorm_impl(shape, fix_gamma, cudnn_off, output_mean_var,
 
             running_mean = running_mean * momentum + \
                 data_mean_flat * (1 - momentum)
+
+            m = _np.prod(shape) / shape[axis]
+            # cudnn uses m-1 in the denominator of its sample variance calculation, not m
+            sample_var_adjust = 1.0 if cudnn_off or fix_gamma else m / (m-1)
             running_var = running_var * momentum + \
-                data_var_flat * (1 - momentum)
+                data_var_flat * sample_var_adjust * (1 - momentum)
 
             W = bn_gamma.reshape(expand_shape)
             dnx = ograd * W
             xsm = data - data_mean
             nd = 1.0 / np.sqrt(data_var + epsilon)
             nx = xsm * nd
-            m = _np.prod(shape) / shape[axis]
             dvar = np.sum(dnx * xsm, axis=reduce_axis, keepdims=True,
                                   ) * (-0.5) * np.power(nd, 3)
             dmean = -nd * np.sum(dnx, axis=reduce_axis, keepdims=True) - \
@@ -3951,7 +3954,7 @@ def hybrid_forward(self, F, mean, sigma):
     for ((shape1, shape2), out_shape) in zip(param_shape, output_shapes):
         mx_out = np.random.lognormal(np.zeros(shape1), np.ones(shape2), out_shape)
         np_out = _np.random.lognormal(np.zeros(shape1).asnumpy(), np.ones(shape2).asnumpy(), out_shape)
-        assert_almost_equal(mx_out.asnumpy().shape, np_out.shape)
+        assert mx_out.asnumpy().shape == np_out.shape
 
     def _test_lognormal_exception(sigma):
         output = np.random.lognormal(sigma=sigma).asnumpy()
@@ -4177,7 +4180,7 @@ def hybrid_forward(self, F, scale):
             with mx.autograd.record():
                 mx_out = test_rayleigh(scale)
             np_out = _np.random.rayleigh(scale = scale.asnumpy(), size = shape)
-            assert_almost_equal(np_out.shape, mx_out.shape)
+            assert np_out.shape == mx_out.shape
             mx_out.backward()
             assert scale.grad.shape == shape
             assert_almost_equal(scale.grad.asnumpy().sum(), mx_out.asnumpy().sum(), rtol=1e-3, atol=1e-5)
@@ -4185,7 +4188,7 @@ def hybrid_forward(self, F, scale):
     for shape in shapes:
         mx_out = np.random.rayleigh(np.array([1]), shape)
         np_out = _np.random.rayleigh(np.array([1]).asnumpy(), shape)
-        assert_almost_equal(mx_out.asnumpy().shape, np_out.shape)
+        assert mx_out.asnumpy().shape == np_out.shape
 
     def _test_rayleigh_exception(scale):
         output = np.random.rayleigh(scale=scale).asnumpy()
@@ -4218,7 +4221,7 @@ def hybrid_forward(self, F, scale):
             with mx.autograd.record():
                 mx_out = test_exponential_grad(scale)
             np_out = _np.random.exponential(scale = scale.asnumpy(), size = out_shape)
-            assert_almost_equal(np_out.shape, mx_out.shape)
+            assert np_out.shape == mx_out.shape
             mx_out.backward()
             assert scale.grad.shape == out_shape
             assert_almost_equal(scale.grad.asnumpy().sum(), mx_out.asnumpy().sum(), rtol=1e-3, atol=1e-5)
@@ -4956,6 +4959,8 @@ def check_svd(UT, L, V, data_np):
                 data_np = _np.random.uniform(-10.0, 10.0, shape)
                 data_np = _np.array(data_np, dtype=dtype)
                 data = np.array(data_np, dtype=dtype)
+                if effective_dtype(data) == np.dtype(np.float16):
+                    continue
                 data.attach_grad()
                 with mx.autograd.record():
                     ret = test_svd(data)
@@ -5211,7 +5216,7 @@ def check_solve(x, a_np, b_np):
             print(e)
         else:
             assert x.shape == x_expected.shape
-            assert_almost_equal(x.asnumpy(), x_expected, rtol=rtol, atol=atol)
+            assert_almost_equal(x, x_expected)
 
     def newInvertibleMatrix_2D(shape, max_cond=4):
         while 1:
@@ -5251,7 +5256,6 @@ def get_grad_b(A, X):
     nrhs = (-1, 0, 1, 2, 3)
     dtypes = ['float32', 'float64']
     for hybridize, shape, dtype, nrh in itertools.product([False, True], shapes, dtypes, nrhs):
-        rtol, atol =1e-2, 1e-4
         test_solve = TestSolve()
         if hybridize:
             test_solve.hybridize()
@@ -5285,8 +5289,8 @@ def get_grad_b(A, X):
                 mx.autograd.backward(mx_out)
                 b_backward_expected = get_grad_b(a.asnumpy(), mx_out.asnumpy())
                 a_backward_expected = -_np.matmul(b_backward_expected, _np.swapaxes(mx_out, -1, -2).asnumpy())
-                assert_almost_equal(a.grad.asnumpy(), a_backward_expected, rtol=rtol, atol=atol)
-                assert_almost_equal(b.grad.asnumpy(), b_backward_expected, rtol=rtol, atol=atol)
+                assert_almost_equal(a.grad, a_backward_expected)
+                assert_almost_equal(b.grad, b_backward_expected)
 
         # check imperative once again
         mx_out = np.linalg.solve(a, b)
@@ -5311,7 +5315,7 @@ def check_tensorinv(inv_a, a_np, ind):
             print(e)
         else:
             assert inv_a.shape == inv_a_expected.shape
-            assert_almost_equal(inv_a.asnumpy(), inv_a_expected, rtol=rtol, atol=atol)
+            assert_almost_equal(inv_a, inv_a_expected)
 
     def newInvertibleMatrix_2D(shape, max_cond=4):
         while 1:
@@ -5354,11 +5358,6 @@ def get_grad_A(A, ind):
     ]
     dtypes = ['float32', 'float64']
     for hybridize, shape, dtype, in itertools.product([False, True], shapes, dtypes):
-        rtol = 1e-3
-        atol = 1e-5
-        if dtype == 'float32':
-            rtol = 1e-2
-            atol = 1e-4
         ind = shape[0]
         test_tensorinv = TestTensorinv(ind=ind)
         if hybridize:
@@ -5386,7 +5385,7 @@ def get_grad_A(A, ind):
         if 0 not in mx_out.shape:
             mx.autograd.backward(mx_out)
             grad_A_expected = get_grad_A(a.asnumpy(), ind)
-            assert_almost_equal(a.grad.asnumpy(), grad_A_expected, rtol=rtol, atol=atol)
+            assert_almost_equal(a.grad, grad_A_expected)
 
     # check imperative once again
     mx_out = np.linalg.tensorinv(a, ind)
@@ -5439,7 +5438,7 @@ def check_tensorsolve(x, a_np, b_np, axes):
             print(e)
         else:
             assert x.shape == x_expected.shape
-            assert_almost_equal(x.asnumpy(), x_expected, rtol=rtol, atol=atol)
+            assert_almost_equal(x, x_expected)
 
     def shapeInfer(a_shape, b_shape, axes=None):
         # b_shape - Right-hand tensor shape, which can be of any shape.
@@ -5501,8 +5500,6 @@ def newInvertibleMatrix_2D(shape, max_cond=4):
     for hybridize in [True, False]:
         for dtype in dtypes:
             for a_shape, b_shape, axes in shapes:
-                rtol = 1e-2 if dtype == 'float32' else 1e-3
-                atol = 1e-4 if dtype == 'float32' else 1e-5
                 test_tensorsolve = TestTensorsolve(axes)
                 if hybridize:
                     test_tensorsolve.hybridize()
@@ -5539,8 +5536,8 @@ def newInvertibleMatrix_2D(shape, max_cond=4):
                     mx.autograd.backward(mx_out)
                     grad_a_expected, grad_b_expected = get_tensorsolve_backward(
                         a.asnumpy(), b.asnumpy(), mx_out.asnumpy(), a_axes, a_origin_axes, a_trans_shape)
-                    assert_almost_equal(a.grad.asnumpy(), grad_a_expected, rtol=rtol, atol=atol)
-                    assert_almost_equal(b.grad.asnumpy(), grad_b_expected, rtol=rtol, atol=atol)
+                    assert_almost_equal(a.grad, grad_a_expected)
+                    assert_almost_equal(b.grad, grad_b_expected)
 
                 # check imperative once again
                 mx_out = test_tensorsolve(a, b)
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 4e736e5dc0ab..edb3e6a5aa4c 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -814,21 +814,26 @@ def test_symbol_pow():
 
 @with_seed()
 def test_fully_connected():
+    # Create data of given shape as a uniform distribution centered on 0.0
+    def random_data(shape, dtype=np.float32):
+        return mx.nd.random.uniform(low=-0.5,
+                                    high=0.5, shape=shape, dtype=dtype)
     data = mx.sym.var("data")
     fc_weight = mx.sym.var("weight")
     fc_bias = mx.sym.var("bias")
     fc = mx.sym.FullyConnected(data=data, weight=fc_weight, bias=fc_bias, num_hidden=10, no_bias=False, name='fc')
-    data = mx.nd.random.uniform(shape=(5, 5, 5, 13), dtype=np.float32)
-    fc_weight = mx.nd.random.uniform(shape=(10, 325), dtype=np.float32)
-    fc_bias = mx.nd.random.uniform(shape=(10), dtype=np.float32)
-    fc_bias2 = mx.nd.random.uniform(shape=(10, 1), dtype=np.float32)
+
+    data = random_data(shape=(5, 5, 5, 13))
+    fc_weight = random_data(shape=(10, 325))
+    fc_bias = random_data(shape=(10))
+    fc_bias2 = random_data(shape=(10, 1))
+
     data_np = data.asnumpy().reshape(5, 325)
     fc_weight_np = np.transpose(fc_weight.asnumpy())
     fc_bias_np = fc_bias.asnumpy()
     res = np.dot(data_np, fc_weight_np) + fc_bias.asnumpy()
     check_symbolic_forward(fc, {'data': data_np, 'weight': fc_weight.asnumpy(), 'bias': fc_bias_np}, {'fc_output': res})
-    check_numeric_gradient(fc, {'data': data_np, 'weight': fc_weight.asnumpy(), 'bias': fc_bias_np},
-                           numeric_eps=1e-2, rtol=1e-4, atol=1e-2)
+    check_numeric_gradient(fc, {'data': data_np, 'weight': fc_weight.asnumpy(), 'bias': fc_bias_np})
     # TODO: Fix Bug #15032 when bias has ndim > 1
     #check_symbolic_forward(fc, {'data': data_np, 'weight': fc_weight.asnumpy(), 'bias': fc_bias2.asnumpy()}, {'fc_output': res})
 
@@ -1905,15 +1910,18 @@ def _test_batchnorm_impl(op_name, shape, fix_gamma, cudnn_off, output_mean_var,
 
             running_mean = running_mean * momentum + \
                 data_mean_flat * (1 - momentum)
+
+            m = np.prod(shape) / shape[axis]
+            # cudnn uses m-1 in the denominator of its sample variance calculation, not m
+            sample_var_adjust = 1.0 if cudnn_off or fix_gamma else m / (m-1)
             running_var = running_var * momentum + \
-                data_var_flat * (1 - momentum)
+                data_var_flat * sample_var_adjust * (1 - momentum)
 
             W = bn_gamma.reshape(expand_shape)
             dnx = ograd * W
             xsm = data - data_mean
             nd = 1.0 / mx.nd.sqrt(data_var + epsilon)
             nx = xsm * nd
-            m = np.prod(shape) / shape[axis]
             dvar = (dnx * xsm).sum(axis=axis, keepdims=True,
                                    exclude=True) * (-0.5) * mx.nd.power(nd, 3)
             dmean = -nd * dnx.sum(axis=axis, keepdims=True, exclude=True) - \
@@ -2848,13 +2856,13 @@ def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym,
                          args_grad={'a': grad_nd})
             net.forward(is_train=True)
 
-            equal_forward = almost_equal_ignore_nan(net.outputs[0].asnumpy(), sum_groundtruth, 1E-4, 1E-4)
-            assert equal_forward
+            # check forward
+            assert_almost_equal_ignore_nan(net.outputs[0].asnumpy(), sum_groundtruth, rtol=1e-4, atol=1e-4)
 
             net.backward(out_grads=mx.nd.array(outgrad_npy))
             bc_grad_groundtruth = np.broadcast_to(grad_groundtruth, grad_nd.shape)
-            equal_backward = almost_equal_ignore_nan(grad_nd.asnumpy(), bc_grad_groundtruth, 1E-4, 1E-4)
-            assert equal_backward
+            # check backward
+            assert_almost_equal_ignore_nan(grad_nd.asnumpy(), bc_grad_groundtruth, rtol=1e-4, atol=1e-4)
 
     test_none_axis = [True, False]
     for test_none in test_none_axis:
@@ -4504,7 +4512,7 @@ def get_large_matrix():
                 out_npy = gt_topk(dat=a_npy, axis=axis, ret_typ="value", k=a_npy.size, is_ascend=is_ascend)
             else:
                 out_npy = gt_topk(dat=a_npy, axis=axis, ret_typ="value", k=5, is_ascend=is_ascend)
-            check_numeric_gradient(b, location={'a': a_npy}, numeric_eps=1e-2, ctx=ctx)
+            check_numeric_gradient(b, location={'a': a_npy}, numeric_eps=1e-2, rtol=1e-2, ctx=ctx)
             check_symbolic_forward(b, location={'a': a_npy}, expected=[out_npy])
 
     b = mx.sym.topk(a, axis=1, is_ascend=is_ascend, ret_typ="indices", k=5)
@@ -4552,7 +4560,7 @@ def get_large_matrix():
                 for is_ascend in [True, False]:
                     b = mx.sym.topk(a, axis=axis, is_ascend=is_ascend, ret_typ="value", k=k)
                     out_npy = gt_topk(dat=a_npy, axis=axis, ret_typ="value", k=k, is_ascend=is_ascend)
-                    check_numeric_gradient(b, location={'a': a_npy}, numeric_eps=1e-2, ctx=ctx)
+                    check_numeric_gradient(b, location={'a': a_npy}, numeric_eps=1e-2, rtol=1e-2, ctx=ctx)
                     check_symbolic_forward(b, location={'a': a_npy}, expected=[out_npy])
 
         b = mx.sym.topk(a, axis=1, is_ascend=is_ascend, ret_typ="indices", k=5)
@@ -4716,7 +4724,7 @@ def test_grid_generator():
         # check forward
         exe.arg_dict['affine'][:] = np.array([[1.0,0,0,0,1.0,0]])
         exe.forward(is_train=True)
-        output = exe.outputs[0].asnumpy()
+        output = exe.outputs[0]
         output[0,0,:,:] = (output[0,0,:,:] + 1) * (target_shape[1] - 1) / 2.0
         output[0,1,:,:] = (output[0,1,:,:] + 1) * (target_shape[0] - 1) / 2.0
         xv, yv = np.meshgrid(np.arange(target_shape[0]), np.arange(target_shape[1]))
@@ -4731,7 +4739,7 @@ def test_grid_generator():
         tmp[1] = -1.0 + (np.arange(target_shape[0]*target_shape[1]) // target_shape[1]) * (2.0 / (target_shape[0]-1))
         tmp[2] = 1
         grad_est = np.dot(out_grad[0].reshape(2,target_shape[0]*target_shape[1]),tmp.T).reshape(1,6)
-        assert_almost_equal(exe.grad_dict['affine'], grad_est, rtol=1e-3, atol=1e-5)
+        assert_almost_equal(exe.grad_dict['affine'], grad_est)
         # check addto
         exe = grid.simple_bind(ctx=default_context(), affine=(1,6), grad_req='add')
         grid_grad_npy = np.random.normal(size=exe.grad_dict['affine'].shape)
@@ -4739,7 +4747,7 @@ def test_grid_generator():
         exe.arg_dict['affine'][:] = np.array([[1.0, 0, 0, 0, 1.0, 0]])
         exe.forward(is_train=True)
         exe.backward(mx.nd.array(out_grad))
-        assert_almost_equal(exe.grad_dict['affine'], grad_est + grid_grad_npy, rtol=1e-2, atol=1e-5)
+        assert_almost_equal(exe.grad_dict['affine'], grad_est + grid_grad_npy)
 
     # transform_type = warp
     test_case = [(12,21),(4,3),(6,12)]
@@ -5784,51 +5792,62 @@ def test_div_sqrt_dim():
     check_symbolic_forward(test, [data_tmp], [data_tmp / np.sqrt(data_tmp.shape[-1])])
 
 
+# helper function to identify inputs likely to fail check_numeric_gradient tol test
+# due to finite difference method inaccuracies or function discontinuities at the origin
+def bad_input_finder(f, f_grad, dtype):
+    eps = default_numeric_eps()[np.dtype(dtype)]
+    rtol = default_rtols()[np.dtype(dtype)]
+    def expected_relative_error(x):
+        fd_gradient = (f(x+eps/2) - f(x-eps/2)) / eps
+        return abs(fd_gradient/f_grad(x) - 1)
+    def is_fd_problem_input(x):
+        return abs(x) < eps/2 or expected_relative_error(x) > rtol
+    return np.vectorize(is_fd_problem_input)
+
 @with_seed()
 def test_reciprocal_op():
-    eps = 2**(-11)
-    data_tmp = np.random.rand(3, 4) * 10 - 5
-    # Avoid possible division by 0 errors and finite difference method inaccuracies.
-    # Factor of 6 below set empirically, depends on eps.
-    # Issue exposed by seed 879579887.
-    # Replace problematic inputs with 1.0.
-    data_tmp[abs(data_tmp) < 6*eps] = 1.0
+    data_tmp = np.random.rand(3, 4).astype(np.float32) * 10 - 5
+
+    # Avoid possible division by 0 errors and finite difference method
+    # inaccuracies by replacing problem inputs with 1.0.
+    is_bad_input = bad_input_finder(np.reciprocal,
+                                    lambda x: -np.reciprocal(x)**2, np.float32)
+    data_tmp[is_bad_input(data_tmp)] = 1.0
     data = mx.symbol.Variable('data')
     test = mx.sym.reciprocal(data)
 
-    check_numeric_gradient(test, [data_tmp], numeric_eps = eps)
+    check_numeric_gradient(test, [data_tmp])
     check_symbolic_forward(test, [data_tmp], [np.reciprocal(data_tmp)])
 
 
 @with_seed()
 def test_cbrt_op():
-    eps = 2**(-11)
-    data_tmp = np.random.rand(3, 4) * 10 - 5
-    # Avoid finite difference method inaccuracies due to infinite gradient at the origin.
-    # Factor of 4 below set empirically, depends on eps.
-    # Issue exposed by seed 553872106.
-    # Replace problematic inputs with 1.0.
-    data_tmp[abs(data_tmp) < 4*eps] = 1.0
+    data_tmp = np.random.rand(3, 4).astype(np.float32) * 10 - 5
+
+    # Avoid possible division by 0 errors and finite difference method
+    # inaccuracies by replacing problem inputs with 1.0.
+    is_bad_input = bad_input_finder(np.cbrt,
+                                    lambda x: 1./(3 * np.cbrt(x)**2), np.float32)
+    data_tmp[is_bad_input(data_tmp)] = 1.0
     data = mx.symbol.Variable('data')
     test = mx.sym.cbrt(data)
-
-    check_numeric_gradient(test, [data_tmp], numeric_eps=eps)
+    check_numeric_gradient(test, [data_tmp])
     check_symbolic_forward(test, [data_tmp], [np.cbrt(data_tmp)])
 
 
 @with_seed()
 def test_rcbrt_op():
-    eps = 2**(-11)
-    data_tmp = np.random.rand(3, 4) * 10 - 5
-    # Avoid possible division by 0 errors and finite difference method inaccuracies.
-    # Factor of 4 below set empirically, depends on eps.
-    # Issue exposed by seed 788174893.
-    # Replace problematic inputs with 1.0.
-    data_tmp[abs(data_tmp) < 4*eps] = 1.0
+    data_tmp = np.random.rand(3, 4).astype(np.float32) * 10 - 5
+
+    # Avoid possible division by 0 errors and finite difference method
+    # inaccuracies by replacing problem inputs with 1.0.
+    is_bad_input = bad_input_finder(lambda x: 1./np.cbrt(x),
+                                    lambda x: -1./(3 * np.cbrt(x)**4), np.float32)
+    data_tmp[is_bad_input(data_tmp)] = 1.0
     data = mx.symbol.Variable('data')
     test = mx.sym.rcbrt(data)
 
-    check_numeric_gradient(test, [data_tmp], numeric_eps = eps)
+    check_numeric_gradient(test, [data_tmp])
     check_symbolic_forward(test, [data_tmp], [1/np.cbrt(data_tmp)])
 
 
@@ -6237,7 +6256,7 @@ def test_deformable_convolution():
                         # By now we only have gpu implementation
                         if default_context().device_type == 'gpu':
                             check_numeric_gradient(op, [im_data, offset_data, weight, bias], rtol=rtol, atol=atol,
-                                                   grad_nodes=grad_nodes, ctx=mx.gpu(0))
+                                                   grad_nodes=grad_nodes, ctx=mx.gpu(0), numeric_eps=1.0/64)
 
 
 def _validate_sample_location(input_rois, input_offset, spatial_scale, pooled_w, pooled_h, sample_per_part, part_size, output_dim, num_classes, trans_std, feat_h, feat_w):
@@ -6330,10 +6349,11 @@ def test_deformable_psroipooling():
                                                grad_nodes=grad_nodes, ctx=mx.gpu(0))
 
 
-def _gemm_test_helper(dtype, grad_check, rtol_fw = 1e-7, atol_fw = 1e-9):
-    num_eps = 1e-6
-    rtol_bw = 1e-5
-    atol_bw = 1e-6
+def _gemm_test_helper(dtype, grad_check, rtol_fw = None, atol_fw = None,
+                                         rtol_bw = None, atol_bw = None, num_eps = None):
+    def np_random_data(shape, dtype=np.float32):
+        return np.random.uniform(low=-0.5,
+                                 high=0.5, size=shape).astype(dtype)
 
     data1 = mx.symbol.Variable('data1')
     data2 = mx.symbol.Variable('data2')
@@ -6352,10 +6372,10 @@ def _gemm_test_helper(dtype, grad_check, rtol_fw = 1e-7, atol_fw = 1e-9):
     shape2 = (3, 2)
     shape3 = (3, 3)
     shape4 = (2, 2)
-    data_in1 = np.random.uniform(1, 10, shape1).astype(dtype)
-    data_in2 = np.random.uniform(1, 10, shape2).astype(dtype)
-    data_in3 = np.random.uniform(1, 10, shape3).astype(dtype)
-    data_in4 = np.random.uniform(1, 10, shape4).astype(dtype)
+    data_in1 = np_random_data(shape1, dtype)
+    data_in2 = np_random_data(shape2, dtype)
+    data_in3 = np_random_data(shape3, dtype)
+    data_in4 = np_random_data(shape4, dtype)
     # Check all transpositions of gemm operator.
     data_in1_t = np.transpose(data_in1)
     data_in2_t = np.transpose(data_in2)
@@ -6462,10 +6482,10 @@ def _gemm_test_helper(dtype, grad_check, rtol_fw = 1e-7, atol_fw = 1e-9):
 def test_gemm():
     _gemm_test_helper(np.float64, True)
     os.environ["MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION"] = "0"
-    _gemm_test_helper(np.float32, False, rtol_fw = 1e-5, atol_fw = 1e-7)
+    _gemm_test_helper(np.float32, True)
     if default_context().device_type == 'gpu':
         os.environ["MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION"] = "1"
-        _gemm_test_helper(np.float32, False, rtol_fw = 2e-5, atol_fw = 2e-7)
+        _gemm_test_helper(np.float32, True)
         os.environ["MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION"] = "0"
 
 # Helper functions for test_laop
diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py
index 4c4e3dbdfc51..cb93fe1b04f0 100644
--- a/tests/python/unittest/test_sparse_operator.py
+++ b/tests/python/unittest/test_sparse_operator.py
@@ -1633,6 +1633,7 @@ def test_fallback(func_name, axis=0, keepdims=True, exclude=True):
 
 
 @with_seed()
+@pytest.mark.serial
 def test_sparse_square_sum():
     dim0 = 30
     dim1 = 30

From efd25d4c8b3f50120aba8fdf0eb42606e3876988 Mon Sep 17 00:00:00 2001
From: Dick Carter <dcarter@nvidia.com>
Date: Fri, 11 Sep 2020 19:01:41 -0700
Subject: [PATCH 03/11] Fix test_gluon_data.py:test_dataloader_context when run
 on 1-GPU system.

---
 tests/python/unittest/test_gluon_data.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py
index c3ae2de41722..ef27a7fd35f5 100644
--- a/tests/python/unittest/test_gluon_data.py
+++ b/tests/python/unittest/test_gluon_data.py
@@ -278,11 +278,11 @@ def test_multi_worker_dataloader_release_pool():
         del D
 
 
+@with_seed()
 def test_dataloader_context():
     X = np.random.uniform(size=(10, 20))
     dataset = gluon.data.ArrayDataset(X)
     default_dev_id = 0
-    custom_dev_id = 1
 
     # use non-pinned memory
     loader1 = gluon.data.DataLoader(dataset, 8)
@@ -294,11 +294,15 @@ def test_dataloader_context():
     for _, x in enumerate(loader2):
         assert x.context == context.cpu_pinned(default_dev_id)
 
-    # use pinned memory with custom device id
-    loader3 = gluon.data.DataLoader(dataset, 8, pin_memory=True,
-                                    pin_device_id=custom_dev_id)
-    for _, x in enumerate(loader3):
-        assert x.context == context.cpu_pinned(custom_dev_id)
+    if mx.context.num_gpus() <= 1:
+        print('Bypassing custom_dev_id pinned mem test on system with < 2 gpus.')
+    else:
+        custom_dev_id = 1
+        # use pinned memory with custom device id
+        loader3 = gluon.data.DataLoader(dataset, 8, pin_memory=True,
+                                        pin_device_id=custom_dev_id)
+        for _, x in enumerate(loader3):
+            assert x.context == context.cpu_pinned(custom_dev_id)
 
 def batchify(a):
     return a

From e63299bb8f87ba32d92afae5c8d17a28aba1aaa0 Mon Sep 17 00:00:00 2001
From: Dick Carter <dcarter@nvidia.com>
Date: Fri, 11 Sep 2020 19:35:09 -0700
Subject: [PATCH 04/11] Remove pytest decorators introduced in error

---
 tests/python/unittest/test_autograd.py        | 1 -
 tests/python/unittest/test_sparse_operator.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/tests/python/unittest/test_autograd.py b/tests/python/unittest/test_autograd.py
index a72af688764e..61955f034a71 100644
--- a/tests/python/unittest/test_autograd.py
+++ b/tests/python/unittest/test_autograd.py
@@ -433,7 +433,6 @@ def check_grad_with_stype(array_stype, grad_stype, expected_stype):
             check_grad_with_stype(stype, grad_stype, grad_stype)
 
 @with_seed()
-@pytest.mark.garbage_expected
 def test_sparse_dot_grad():
     def check_sparse_dot_grad(rhs):
         lhs = rand_ndarray((2, 8), 'csr')
diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py
index cb93fe1b04f0..4c4e3dbdfc51 100644
--- a/tests/python/unittest/test_sparse_operator.py
+++ b/tests/python/unittest/test_sparse_operator.py
@@ -1633,7 +1633,6 @@ def test_fallback(func_name, axis=0, keepdims=True, exclude=True):
 
 
 @with_seed()
-@pytest.mark.serial
 def test_sparse_square_sum():
     dim0 = 30
     dim1 = 30

From 8c0e7d975810557bd127af53cda5f17381f5acfe Mon Sep 17 00:00:00 2001
From: Dick Carter <dcarter@nvidia.com>
Date: Fri, 11 Sep 2020 19:54:56 -0700
Subject: [PATCH 05/11] Fix test_forward.py:test_consistency

---
 tests/python/gpu/test_forward.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/gpu/test_forward.py b/tests/python/gpu/test_forward.py
index 02b0256024d3..2572eda31c40 100644
--- a/tests/python/gpu/test_forward.py
+++ b/tests/python/gpu/test_forward.py
@@ -74,7 +74,7 @@ def test_consistency(dump=False):
     ctx_list = [{'ctx': mx.gpu(0), 'data': data.shape, 'type_dict': {'data': data.dtype}},
                 {'ctx': mx.cpu(0), 'data': data.shape, 'type_dict': {'data': data.dtype}}]
     gt = check_consistency(sym, ctx_list, arg_params=arg_params, aux_params=aux_params,
-                           tol=1e-3, grad_req='null', raise_on_err=False, ground_truth=gt)
+                           rtol=1e-3, atol=1e-3, grad_req='null', raise_on_err=False, ground_truth=gt)
     if dump:
         np.savez('data/inception-v3-dump.npz', **{n: a.asnumpy() for n, a in gt.items()})
 

From 9714e23e8a22a263895a48f28cc2d420a516bde9 Mon Sep 17 00:00:00 2001
From: Dick Carter <dcarter@nvidia.com>
Date: Sat, 12 Sep 2020 13:21:17 -0700
Subject: [PATCH 06/11] Fix test_numpy_op.py tests

---
 tests/python/unittest/test_numpy_op.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index c5804c253744..75dccfd577b6 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -4112,8 +4112,8 @@ def _test_random_beta_range(output):
         mx_out = test_random_beta(mx_data, mx_data)
         mx_out_imperative = mx.np.random.beta(mx_data, mx_data, size=param_shape, dtype=out_dtype)
 
-        assert_almost_equal(np_out.shape, mx_out.shape)
-        assert_almost_equal(np_out.shape, mx_out_imperative.shape)
+        assert np_out.shape == mx_out.shape
+        assert np_out.shape == mx_out_imperative.shape
         assert _test_random_beta_range(mx_out.asnumpy()) == True
         assert _test_random_beta_range(mx_out_imperative.asnumpy()) == True
 
@@ -4153,8 +4153,8 @@ def hybrid_forward(self, F, df):
         mx_out = test_random_chisquare(mx_df)
         mx_out_imperative = mx.np.random.chisquare(mx_df, size=param_shape, dtype=out_dtype)
 
-        assert_almost_equal(np_out.shape, mx_out.shape)
-        assert_almost_equal(np_out.shape, mx_out_imperative.shape)
+        assert np_out.shape == mx_out.shape
+        assert np_out.shape == mx_out_imperative.shape
 
 
 @with_seed()
@@ -4959,7 +4959,7 @@ def check_svd(UT, L, V, data_np):
                 data_np = _np.random.uniform(-10.0, 10.0, shape)
                 data_np = _np.array(data_np, dtype=dtype)
                 data = np.array(data_np, dtype=dtype)
-                if effective_dtype(data) == np.dtype(np.float16):
+                if effective_dtype(data) == _np.dtype(_np.float16):
                     continue
                 data.attach_grad()
                 with mx.autograd.record():

From 5bd44dbead582cb0b4ee61faf5b3f83e8c0b8a15 Mon Sep 17 00:00:00 2001
From: Dick Carter <dcarter@nvidia.com>
Date: Sat, 12 Sep 2020 13:24:29 -0700
Subject: [PATCH 07/11] Improve test seeding in test_numpy_interoperability.py
 (#18762)

---
 .../unittest/test_numpy_interoperability.py   | 29 ++++++++++++-------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index 67722047ddee..fd8abf1849be 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -28,7 +28,7 @@
 from mxnet.test_utils import assert_almost_equal
 from mxnet.test_utils import use_np
 from mxnet.test_utils import is_op_runnable
-from common import assertRaises, with_seed, random_seed
+from common import assertRaises, with_seed, random_seed, setup_module, teardown
 from mxnet.numpy_dispatch_protocol import with_array_function_protocol, with_array_ufunc_protocol
 from mxnet.numpy_dispatch_protocol import _NUMPY_ARRAY_FUNCTION_LIST, _NUMPY_ARRAY_UFUNC_LIST
 
@@ -58,8 +58,15 @@ def add_workload(name, *args, **kwargs):
 
     @staticmethod
     def get_workloads(name):
+        if OpArgMngr._args == {}:
+            _prepare_workloads()
         return OpArgMngr._args.get(name, None)
 
+    @staticmethod
+    def randomize_workloads():
+        # Force a new _prepare_workloads(), which will be based on new random numbers
+        OpArgMngr._args = {}
+
 
 def _add_workload_all():
     # check bad element in all positions
@@ -470,8 +477,8 @@ def _add_workload_linalg_cholesky():
     shapes = [(1, 1), (2, 2), (3, 3), (50, 50), (3, 10, 10)]
     dtypes = (np.float32, np.float64)
 
-    for shape, dtype in itertools.product(shapes, dtypes):
-        with random_seed(1):
+    with random_seed(1):
+        for shape, dtype in itertools.product(shapes, dtypes):
             a = _np.random.randn(*shape)
 
         t = list(range(len(shape)))
@@ -2979,9 +2986,6 @@ def _prepare_workloads():
     _add_workload_vander()
 
 
-_prepare_workloads()
-
-
 def _get_numpy_op_output(onp_op, *args, **kwargs):
     onp_args = [arg.asnumpy() if isinstance(arg, np.ndarray) else arg for arg in args]
     onp_kwargs = {k: v.asnumpy() if isinstance(v, np.ndarray) else v for k, v in kwargs.items()}
@@ -2993,7 +2997,7 @@ def _get_numpy_op_output(onp_op, *args, **kwargs):
     return onp_op(*onp_args, **onp_kwargs)
 
 
-def _check_interoperability_helper(op_name, *args, **kwargs):
+def _check_interoperability_helper(op_name, rel_tol, abs_tol, *args, **kwargs):
     strs = op_name.split('.')
     if len(strs) == 1:
         onp_op = getattr(_np, op_name)
@@ -3009,11 +3013,11 @@ def _check_interoperability_helper(op_name, *args, **kwargs):
         assert type(out) == type(expected_out)
         for arr, expected_arr in zip(out, expected_out):
             if isinstance(arr, np.ndarray):
-                assert_almost_equal(arr.asnumpy(), expected_arr, rtol=1e-3, atol=1e-4, use_broadcast=False, equal_nan=True)
+                assert_almost_equal(arr.asnumpy(), expected_arr, rtol=rel_tol, atol=abs_tol, use_broadcast=False, equal_nan=True)
             else:
                 _np.testing.assert_equal(arr, expected_arr)
     elif isinstance(out, np.ndarray):
-        assert_almost_equal(out.asnumpy(), expected_out, rtol=1e-3, atol=1e-4, use_broadcast=False, equal_nan=True)
+        assert_almost_equal(out.asnumpy(), expected_out, rtol=rel_tol, atol=abs_tol, use_broadcast=False, equal_nan=True)
     elif isinstance(out, _np.dtype):
         _np.testing.assert_equal(out, expected_out)
     else:
@@ -3025,6 +3029,7 @@ def _check_interoperability_helper(op_name, *args, **kwargs):
 
 
 def check_interoperability(op_list):
+    OpArgMngr.randomize_workloads()
     for name in op_list:
         if name in _TVM_OPS and not is_op_runnable():
             continue
@@ -3033,13 +3038,17 @@ def check_interoperability(op_list):
         if name in ['full_like', 'zeros_like', 'ones_like'] and \
                 StrictVersion(platform.python_version()) < StrictVersion('3.0.0'):
             continue
+        default_tols = (1e-3, 1e-4)
+        tols = {'linalg.tensorinv': (1e-2, 5e-3),
+                'linalg.solve':     (1e-3, 5e-2)}
+        (rel_tol, abs_tol) = tols.get(name, default_tols)
         print('Dispatch test:', name)
         workloads = OpArgMngr.get_workloads(name)
         assert workloads is not None, 'Workloads for operator `{}` has not been ' \
                                       'added for checking interoperability with ' \
                                       'the official NumPy.'.format(name)
         for workload in workloads:
-            _check_interoperability_helper(name, *workload['args'], **workload['kwargs'])
+            _check_interoperability_helper(name, rel_tol, abs_tol, *workload['args'], **workload['kwargs'])
 
 
 @with_seed()

From abba2aa35dc730fbc61f53a08232b80027c67c0d Mon Sep 17 00:00:00 2001
From: Dick Carter <dcarter@nvidia.com>
Date: Sat, 12 Sep 2020 17:27:59 -0700
Subject: [PATCH 08/11] Fix test_numpy_op.py:test_np_random_{beta,chisquare}

---
 tests/python/unittest/test_numpy_op.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index 75dccfd577b6..11f851a2cd50 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -4096,7 +4096,8 @@ def _test_random_beta_range(output):
         smaller_than_one = _np.all(output < 1)
         return bigger_than_zero and smaller_than_one
 
-    shape_list = [(), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None]
+    # Starting with numpy 1.19.0: "Output size () is not compatible with broadcast dimensions of inputs (1,)."
+    shape_list = [(1,), (2, 3), (4, 0, 5), 6, (7, 8), None]
     # since fp16 might incur precision issue, the corresponding test is skipped
     dtype_list = [np.float32, np.float64]
     hybridize_list = [False, True]
@@ -4135,7 +4136,8 @@ def __init__(self, size=None, dtype=None, ctx=None):
         def hybrid_forward(self, F, df):
             return F.np.random.chisquare(df, size=self._size, dtype=self._dtype, ctx=self._ctx)
 
-    shape_list = [(), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None]
+    # Starting with numpy 1.19.0: "Output size () is not compatible with broadcast dimensions of inputs (1,)."
+    shape_list = [(1,), (2, 3), (4, 0, 5), 6, (7, 8), None]
 
     dtype_list = [np.float16, np.float32, np.float64]
     hybridize_list = [False, True]

From a11af29bf83741b69b33caa1c6988130426b024d Mon Sep 17 00:00:00 2001
From: Dick Carter <dcarter@nvidia.com>
Date: Sun, 13 Sep 2020 13:11:02 -0700
Subject: [PATCH 09/11] Reduce problem sizes with
 test_optimizer.py:test_multilamb

---
 tests/python/unittest/test_optimizer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 6d7cf40f29f7..4c4285d77c0a 100755
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -504,10 +504,10 @@ def test_multilamb():
     opt1 = PyLAMB
     opt2 = mx.optimizer.LAMB
 
-    # shapes as Bert-large
-    dims_x = [1024, 4096, 1024, 1024]
-    dims_y = [1, 1, 1024, 4096]
-    dims_occurrences = [9, 1, 4, 2]
+    dims_x = [1024, 4096, 1024]
+    dims_y = [1, 1, 1024]
+    dims_occurrences = [2, 1, 2]
+
     nlayers = 4 # 24
     # extra_dims_x=[30522, 512, 30522]
     # extra_dims_y=[1, 1024, 1024]

From ed5c287ba43475f71734415c92e279bf8b52bb0b Mon Sep 17 00:00:00 2001
From: Dick Carter <dcarter@nvidia.com>
Date: Sun, 13 Sep 2020 16:03:18 -0700
Subject: [PATCH 10/11] Skip test_gluon_gpu.py:test_fused_{lstm,gru}_layer, fix
 test_rnn_cells, for fp16 contexts

---
 python/mxnet/test_utils.py              |  2 +-
 tests/python/unittest/test_gluon_rnn.py | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 9e544547b1a1..3e068604954f 100755
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -119,7 +119,7 @@ def effective_dtype(dat):
     def is_TF32_enabled(ctx):
         try:
             return (ctx.device_type == 'gpu' and
-                    get_cuda_compute_capability(ctx) == 80 and
+                    get_cuda_compute_capability(ctx) >= 80 and
                     os.environ.get('NVIDIA_TF32_OVERRIDE') != '0')
         except:  # pylint: disable=bare-except
             return False
diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py
index 6f9308b12cea..0c34d55971d0 100644
--- a/tests/python/unittest/test_gluon_rnn.py
+++ b/tests/python/unittest/test_gluon_rnn.py
@@ -23,7 +23,7 @@
 from functools import partial
 from numpy.testing import assert_allclose
 import unittest
-from mxnet.test_utils import almost_equal, assert_almost_equal
+from mxnet.test_utils import almost_equal, assert_almost_equal, effective_dtype
 from common import assert_raises_cudnn_not_satisfied, with_seed
 
 
@@ -445,13 +445,13 @@ def check_rnn_forward(layer, inputs, deterministic=True):
         out.backward()
 
     if isinstance(inputs, mx.nd.NDArray):
-        input_grads = inputs.grad.asnumpy()
+        input_grads = inputs.grad
     else:
         input_grads = np.stack([x.grad.asnumpy() for x in inputs], axis=1)
 
     if deterministic:
-        mx.test_utils.assert_almost_equal(np_out, out.asnumpy(), rtol=1e-3, atol=1e-5)
-        mx.test_utils.assert_almost_equal(np_dx, input_grads, rtol=1e-3, atol=1e-5)
+        mx.test_utils.assert_almost_equal(np_out, out)
+        mx.test_utils.assert_almost_equal(np_dx, input_grads)
 
 
 def test_rnn_cells():
@@ -746,6 +746,9 @@ def check_rnn_bidir_layer_gradients(mode, input_size, hidden_size, num_layers, l
 @with_seed()
 @assert_raises_cudnn_not_satisfied(min_version='5.1.10')
 def test_fused_lstm_layer():
+    if effective_dtype(mx.nd.array([1.,])) == np.float16:
+        print('Skipping test: effective dtype for this context is float16.')
+        return
     input_sizes = [8]
     hidden_sizes = [8, 16]
     num_layers = [1, 2, 3, 4]
@@ -758,6 +761,9 @@ def test_fused_lstm_layer():
 @with_seed()
 @assert_raises_cudnn_not_satisfied(min_version='5.1.10')
 def test_fused_gru_layer():
+    if effective_dtype(mx.nd.array([1.,])) == np.float16:
+        print('Skipping test: effective dtype for this context is float16.')
+        return
     input_sizes = [8]
     hidden_sizes = [8, 16]
     num_layers = [1, 2, 3, 4]

From cf8d091807b1ab0318ad4a7d01177d39aa2f0f12 Mon Sep 17 00:00:00 2001
From: Dick Carter <dcarter@nvidia.com>
Date: Wed, 16 Sep 2020 23:32:37 -0700
Subject: [PATCH 11/11] Trigger CI