diff --git a/benchmark/python/ffi/benchmark_ffi.py b/benchmark/python/ffi/benchmark_ffi.py
index 67701020b205..6da4cd39d2a9 100644
--- a/benchmark/python/ffi/benchmark_ffi.py
+++ b/benchmark/python/ffi/benchmark_ffi.py
@@ -65,6 +65,7 @@ def prepare_workloads():
     OpArgMngr.add_workload("linalg.eigh", pool['3x3'])
     OpArgMngr.add_workload("linalg.det", pool['3x3'])
     OpArgMngr.add_workload("linalg.slogdet", pool['3x3'])
+    OpArgMngr.add_workload("linalg.matrix_rank", pool['3x3'], pool['1'], hermitian=False)
     OpArgMngr.add_workload("linalg.svd", pool['3x3'])
     OpArgMngr.add_workload("linalg.cholesky", pool['1x1'])
     OpArgMngr.add_workload("linalg.eigvals", pool['1x1'])
@@ -124,10 +125,10 @@ def prepare_workloads():
                            out=dnp.array([False, False], dtype=bool), keepdims=False)
     OpArgMngr.add_workload("roll", pool["2x2"], 1, axis=0)
     OpArgMngr.add_workload("rot90", pool["2x2"], 2)
-    OpArgMngr.add_workload("array_split", pool['2X2'], 2, axis=1)
-    OpArgMngr.add_workload("vsplit", pool['2X2'], 2)
-    OpArgMngr.add_workload("hsplit", pool['2X2'], 2)
-    OpArgMngr.add_workload("dsplit", pool['2X2x2'], 2)
+    OpArgMngr.add_workload("array_split", pool['2x2'], 2, axis=1)
+    OpArgMngr.add_workload("vsplit", pool['2x2'], 2)
+    OpArgMngr.add_workload("hsplit", pool['2x2'], 2)
+    OpArgMngr.add_workload("dsplit", pool['2x2x2'], 2)
     OpArgMngr.add_workload("arange", 10)
     OpArgMngr.add_workload("concatenate", (pool['1x2'], pool['1x2'], pool['1x2']), axis=0)
     OpArgMngr.add_workload("append", pool['2x2'], pool['1x2'], axis=0)
diff --git a/python/mxnet/ndarray/numpy/linalg.py b/python/mxnet/ndarray/numpy/linalg.py
index 7d7d6fc064c7..86bf11a00b02 100644
--- a/python/mxnet/ndarray/numpy/linalg.py
+++ b/python/mxnet/ndarray/numpy/linalg.py
@@ -23,7 +23,52 @@
 from . import _api_internal
 
 __all__ = ['norm', 'svd', 'cholesky', 'qr', 'inv', 'det', 'slogdet', 'solve', 'tensorinv', 'tensorsolve',
-           'pinv', 'eigvals', 'eig', 'eigvalsh', 'eigh', 'lstsq']
+           'pinv', 'eigvals', 'eig', 'eigvalsh', 'eigh', 'lstsq', 'matrix_rank']
+
+
+def matrix_rank(M, tol=None, hermitian=False):
+    """
+    Return matrix rank of array using SVD method
+
+    Rank of the array is the number of singular values of the array that are
+    greater than `tol`.
+
+    Parameters
+    M : {(M,), (..., M, N)} ndarray
+        Input vector or stack of matrices.
+    tol : (...) ndarray, float, optional
+        Threshold below which SVD values are considered zero. If `tol` is
+        None, and ``S`` is an array with singular values for `M`, and
+        ``eps`` is the epsilon value for datatype of ``S``, then `tol` is
+        set to ``S.max() * max(M.shape) * eps``.
+    hermitian : bool, optional
+        If True, `M` is assumed to be Hermitian (symmetric if real-valued),
+        enabling a more efficient method for finding singular values.
+        Defaults to False.
+
+    Returns
+    -------
+    rank : (...) ndarray
+        Rank of M.
+
+    Examples
+    --------
+    >>> from mxnet import np
+    >>> np.matrix_rank(np.eye(4)) # Full rank matrix
+    4
+    >>> I=np.eye(4); I[-1,-1] = 0. # rank deficient matrix
+    >>> np.matrix_rank(I)
+    3
+    >>> np.matrix_rank(np.ones((4,))) # 1 dimension - rank 1 unless all 0
+    1
+    >>> np.matrix_rank(np.zeros((4,)))
+    0
+    """
+    finfo_eps_32 = _np.finfo(_np.float32).eps
+    finfo_eps_64 = _np.finfo(_np.float64).eps
+    if hermitian is True:
+        raise NotImplementedError("hermitian is not supported yet...")
+    return _api_internal.matrix_rank(M, tol, hermitian, finfo_eps_32, finfo_eps_64)
 
 
 def lstsq(a, b, rcond='warn'):
diff --git a/python/mxnet/numpy/fallback_linalg.py b/python/mxnet/numpy/fallback_linalg.py
index 5e06ff94a4ce..79d6b83062ec 100644
--- a/python/mxnet/numpy/fallback_linalg.py
+++ b/python/mxnet/numpy/fallback_linalg.py
@@ -24,11 +24,9 @@
 __all__ = [
     'cond',
     'matrix_power',
-    'matrix_rank',
     'multi_dot'
 ]
 
 cond = onp.linalg.cond
 matrix_power = onp.linalg.matrix_power
-matrix_rank = onp.linalg.matrix_rank
 multi_dot = onp.linalg.multi_dot
diff --git a/python/mxnet/numpy/linalg.py b/python/mxnet/numpy/linalg.py
index 445adfdeaeae..d2756d531b7c 100644
--- a/python/mxnet/numpy/linalg.py
+++ b/python/mxnet/numpy/linalg.py
@@ -22,10 +22,51 @@
 from . import fallback_linalg
 
 __all__ = ['norm', 'svd', 'cholesky', 'qr', 'inv', 'det', 'slogdet', 'solve', 'tensorinv', 'tensorsolve',
-           'pinv', 'eigvals', 'eig', 'eigvalsh', 'eigh', 'lstsq']
+           'pinv', 'eigvals', 'eig', 'eigvalsh', 'eigh', 'lstsq', 'matrix_rank']
 __all__ += fallback_linalg.__all__
 
 
+def matrix_rank(M, tol=None, hermitian=False):
+    """
+    Return matrix rank of array using SVD method
+
+    Rank of the array is the number of singular values of the array that are
+    greater than `tol`.
+
+    Parameters
+    M : {(M,), (..., M, N)} ndarray
+        Input vector or stack of matrices.
+    tol : (...) ndarray, float, optional
+        Threshold below which SVD values are considered zero. If `tol` is
+        None, and ``S`` is an array with singular values for `M`, and
+        ``eps`` is the epsilon value for datatype of ``S``, then `tol` is
+        set to ``S.max() * max(M.shape) * eps``.
+    hermitian : bool, optional
+        If True, `M` is assumed to be Hermitian (symmetric if real-valued),
+        enabling a more efficient method for finding singular values.
+        Defaults to False.
+
+    Returns
+    -------
+    rank : (...) ndarray
+        Rank of M.
+
+    Examples
+    --------
+    >>> from mxnet import np
+    >>> np.matrix_rank(np.eye(4)) # Full rank matrix
+    4
+    >>> I=np.eye(4); I[-1,-1] = 0. # rank deficient matrix
+    >>> np.matrix_rank(I)
+    3
+    >>> np.matrix_rank(np.ones((4,))) # 1 dimension - rank 1 unless all 0
+    1
+    >>> np.matrix_rank(np.zeros((4,)))
+    0
+    """
+    return _mx_nd_np.linalg.matrix_rank(M, tol, hermitian)
+
+
 def lstsq(a, b, rcond='warn'):
     r"""
     Return the least-squares solution to a linear matrix equation.
diff --git a/python/mxnet/numpy_dispatch_protocol.py b/python/mxnet/numpy_dispatch_protocol.py
index 9d973e492f04..e693a00ea1a5 100644
--- a/python/mxnet/numpy_dispatch_protocol.py
+++ b/python/mxnet/numpy_dispatch_protocol.py
@@ -166,6 +166,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs):
     'linalg.eigvalsh',
     'linalg.eigh',
     'linalg.qr',
+    'linalg.matrix_rank',
     'shape',
     'trace',
     'tril',
diff --git a/python/mxnet/symbol/numpy/linalg.py b/python/mxnet/symbol/numpy/linalg.py
index 81740dd0ac2e..da7095520674 100644
--- a/python/mxnet/symbol/numpy/linalg.py
+++ b/python/mxnet/symbol/numpy/linalg.py
@@ -23,7 +23,40 @@
 from . import _internal as _npi
 
 __all__ = ['norm', 'svd', 'cholesky', 'qr', 'inv', 'det', 'slogdet', 'solve', 'tensorinv', 'tensorsolve',
-           'pinv', 'eigvals', 'eig', 'eigvalsh', 'eigh', 'lstsq']
+           'pinv', 'eigvals', 'eig', 'eigvalsh', 'eigh', 'lstsq', 'matrix_rank']
+
+
+def matrix_rank(M, tol=None, hermitian=False):
+    """
+    Return matrix rank of array using SVD method
+
+    Rank of the array is the number of singular values of the array that are
+    greater than `tol`.
+
+    Parameters
+    M : {(M,), (..., M, N)} _Symbol
+        Input vector or stack of matrices.
+    tol : (...) _Symbol, float, optional
+        Threshold below which SVD values are considered zero. If `tol` is
+        None, and ``S`` is an array with singular values for `M`, and
+        ``eps`` is the epsilon value for datatype of ``S``, then `tol` is
+        set to ``S.max() * max(M.shape) * eps``.
+    hermitian : bool, optional
+        If True, `M` is assumed to be Hermitian (symmetric if real-valued),
+        enabling a more efficient method for finding singular values.
+        Defaults to False.
+
+    Returns
+    -------
+    rank : (...) _Symbol
+        Rank of M.
+    """
+    finfo_eps_32 = _np.finfo(_np.float32).eps
+    finfo_eps_64 = _np.finfo(_np.float64).eps
+    if tol is None:
+        return _npi.matrix_rank_none_tol(M, finfo_eps_32, finfo_eps_64, hermitian)
+    else:
+        return _npi.matrix_rank(M, tol, hermitian)
 
 
 def lstsq(a, b, rcond='warn'):
diff --git a/src/api/operator/numpy/linalg/np_matrix_rank.cc b/src/api/operator/numpy/linalg/np_matrix_rank.cc
new file mode 100644
index 000000000000..4bfe66664ef8
--- /dev/null
+++ b/src/api/operator/numpy/linalg/np_matrix_rank.cc
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file np_pinv.cc
+ * \brief Implementation of the API of functions in src/operator/numpy/linalg/np_matrix_rank.cc
+ */
+#include <mxnet/api_registry.h>
+#include <mxnet/runtime/packed_func.h>
+#include "../../utils.h"
+#include "../../../../operator/numpy/linalg/np_matrix_rank-inl.h"
+
+namespace mxnet {
+
+inline static void _npi_matrix_rank_none_tol(runtime::MXNetArgs args,
+                                             runtime::MXNetRetValue* ret) {
+  using namespace runtime;
+  const nnvm::Op* op = Op::Get("_npi_matrix_rank_none_tol");
+  op::MatrixRankNoneTolParam param;
+  nnvm::NodeAttrs attrs;
+  param.hermitian = args[2].operator bool();
+  param.finfoEps32 = args[3].operator double();
+  param.finfoEps64 = args[4].operator double();
+  attrs.parsed = param;
+  attrs.op = op;
+  SetAttrDict<op::MatrixRankNoneTolParam>(&attrs);
+  int num_inputs = 1;
+  int num_outputs = 0;
+  NDArray* inputs[] = {args[0].operator mxnet::NDArray*()};
+  auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, nullptr);
+  *ret = reinterpret_cast<mxnet::NDArray*>(ndoutputs[0]);
+}
+
+inline static void _npi_matrix_rank(runtime::MXNetArgs args,
+                                    runtime::MXNetRetValue* ret) {
+  using namespace runtime;
+  const nnvm::Op* op = Op::Get("_npi_matrix_rank");
+  op::MatrixRankParam param;
+  nnvm::NodeAttrs attrs;
+  param.hermitian = args[2].operator bool();
+  attrs.parsed = param;
+  attrs.op = op;
+  SetAttrDict<op::MatrixRankParam>(&attrs);
+  int num_inputs = 2;
+  int num_outputs = 0;
+  NDArray* inputs[] = {args[0].operator mxnet::NDArray*(), args[1].operator mxnet::NDArray*()};
+  auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, nullptr);
+  *ret = reinterpret_cast<mxnet::NDArray*>(ndoutputs[0]);
+}
+
+MXNET_REGISTER_API("_npi.matrix_rank")
+.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) {
+  if (args[1].type_code() == kNull) {
+    _npi_matrix_rank_none_tol(args, ret);
+  } else {
+    _npi_matrix_rank(args, ret);
+  }
+});
+
+}  // namespace mxnet
diff --git a/src/operator/numpy/linalg/np_matrix_rank-inl.h b/src/operator/numpy/linalg/np_matrix_rank-inl.h
new file mode 100644
index 000000000000..8ccecb57db11
--- /dev/null
+++ b/src/operator/numpy/linalg/np_matrix_rank-inl.h
@@ -0,0 +1,449 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2020 by Contributors
+ * \file np_matrix_rank-inl.h
+ * \brief Placeholder for matrix_rank
+ */
+#ifndef MXNET_OPERATOR_NUMPY_LINALG_NP_MATRIX_RANK_INL_H_
+#define MXNET_OPERATOR_NUMPY_LINALG_NP_MATRIX_RANK_INL_H_
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include <string>
+#include <utility>
+#include <algorithm>
+#include "../../operator_common.h"
+#include "../../mshadow_op.h"
+#include "./np_pinv-inl.h"
+
+namespace mxnet {
+namespace op {
+
+using namespace mshadow;
+
+struct MatrixRankNoneTolParam : public dmlc::Parameter<MatrixRankNoneTolParam> {
+  float finfoEps32;
+  double finfoEps64;
+  bool hermitian;
+  DMLC_DECLARE_PARAMETER(MatrixRankNoneTolParam) {
+    DMLC_DECLARE_FIELD(finfoEps32)
+    .set_default(0)
+    .describe("Machine limits for float32 type");
+    DMLC_DECLARE_FIELD(finfoEps64)
+    .set_default(0)
+    .describe("Machine limits for float64 type");
+    DMLC_DECLARE_FIELD(hermitian)
+    .set_default(false)
+    .describe("If True, M is assumed to be Hermitian (symmetric if real-valued).");
+  }
+  void SetAttrDict(std::unordered_map<std::string, std::string>* dict) {
+    std::ostringstream finfoEps32_s, finfoEps64_s, hermitian_s;
+    finfoEps32_s << finfoEps32;
+    finfoEps64_s << finfoEps64;
+    hermitian_s << hermitian;
+    (*dict)["finfoEps32"] = finfoEps32_s.str();
+    (*dict)["finfoEps64"] = finfoEps64_s.str();
+    (*dict)["hermitian"] = hermitian_s.str();
+  }
+};
+
+struct MatrixRankParam : public dmlc::Parameter<MatrixRankParam> {
+  bool hermitian;
+  DMLC_DECLARE_PARAMETER(MatrixRankParam) {
+    DMLC_DECLARE_FIELD(hermitian)
+    .set_default(false)
+    .describe("If True, M is assumed to be Hermitian (symmetric if real-valued).");
+  }
+  void SetAttrDict(std::unordered_map<std::string, std::string>* dict) {
+    std::ostringstream hermitian_s;
+    hermitian_s << hermitian;
+    (*dict)["hermitian"] = hermitian_s.str();
+  }
+};
+
+template<int req>
+struct VectorRankKernel {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, const DType *in_data,
+                                  int64_t *out_data, const int& data_size) {
+    bool all_nozero = true;
+    for (int j = 0; j < data_size; ++j) {
+      if (!((in_data[j] > 0 ? in_data[j] : -in_data[j]) > 0)) {
+        all_nozero = false;
+        break;
+      }
+    }
+    KERNEL_ASSIGN(*out_data, req, static_cast<int64_t>(all_nozero ? 1 : 0));
+  }
+};
+
+template<int req>
+struct MatrixRankNoneTolKernel {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, const DType *in_data, int64_t *out_data,
+                                  const int& nrow, const int& ncol, const double& finfoEps,
+                                  const int& data_size, const int& batch_size) {
+    if (i < batch_size) {
+      DType max_singular_value = 0;
+      for (int j = 0; j < data_size; ++j) {
+        DType sv = in_data[j + i * data_size];
+        max_singular_value = sv > max_singular_value ? sv : max_singular_value;
+      }
+      double tol = (nrow > ncol ? nrow : ncol) * static_cast<double>(max_singular_value) * finfoEps;
+      int64_t rank_num = 0;
+      for (int j = 0; j < data_size; ++j) {
+        rank_num += in_data[j + i * data_size] > tol ? 1 : 0;
+      }
+      KERNEL_ASSIGN(out_data[i], req, rank_num);
+    }
+  }
+};
+
+template<int req>
+struct MatrixRankKernel {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, const DType *in_data, int64_t *out_data,
+                                  const int& data_size, const int& batch_size) {
+    if (i < batch_size) {
+      int64_t rank_num = 0;
+      for (int j = 0; j < data_size; ++j) {
+        rank_num += in_data[j + i * data_size] > 0 ? 1 : 0;
+      }
+      KERNEL_ASSIGN(out_data[i], req, rank_num);
+    }
+  }
+};
+
+struct SVDWrapper {
+  template<typename xpu, typename DType>
+  static void op(const TBlob& a, const TBlob& s,
+                 const TBlob& u, const mxnet::TShape& ut_shape,
+                 const TBlob& v, const mxnet::TShape& vt_shape,
+                 const TBlob& work, const OpContext& ctx) {
+    Stream<xpu> *s_xpu = ctx.get_stream<xpu>();
+    const mxnet::TShape& a_shape = a.shape_;
+    const mxnet::TShape& ut_axis = GetTransAxis(u.shape_);
+    const int a_ndim = a.ndim();
+    const int nrow = a_shape[a_ndim - 2];
+    const int ncol = a_shape[a_ndim - 1];
+    if (nrow > ncol) {
+      const_cast<TBlob&>(u) = u.reshape(ut_shape);
+      const_cast<TBlob&>(v) = v.reshape(vt_shape);
+      mxnet::op::TransposeImpl<xpu>(ctx.run_ctx, a, u, ut_axis);
+      BatchSVDImpl(ncol, nrow,
+                   v.FlatToKD<xpu, 3, DType>(s_xpu),
+                   s.FlatToKD<xpu, 2, DType>(s_xpu),
+                   u.FlatToKD<xpu, 3, DType>(s_xpu),
+                   work.FlatToKD<xpu, 1, DType>(s_xpu), s_xpu);
+    } else {
+      if (a.dptr<DType>() != v.dptr<DType>()) {
+        Copy(v.FlatToKD<xpu, 3, DType>(s_xpu), a.FlatToKD<xpu, 3, DType>(s_xpu), s_xpu);
+      }
+      BatchSVDImpl(nrow, ncol,
+                   u.FlatToKD<xpu, 3, DType>(s_xpu),
+                   s.FlatToKD<xpu, 2, DType>(s_xpu),
+                   v.FlatToKD<xpu, 3, DType>(s_xpu),
+                   work.FlatToKD<xpu, 1, DType>(s_xpu), s_xpu);
+    }
+  }
+};
+
+inline void GetOrCheckBroadcastShape(const nnvm::NodeAttrs& attrs,
+                                     const mxnet::TShape& a_shape,
+                                     const mxnet::TShape& tol_shape,
+                                     mxnet::TShape *broadcast_shape = nullptr,
+                                     mxnet::TShape *new_tol_shape = nullptr) {
+  CHECK_GE(a_shape.ndim(), 2);
+  const int a_ndim = a_shape.ndim();
+  const int tol_ndim = tol_shape.ndim();
+  const int nrow = a_shape[a_ndim - 2];
+  const int ncol = a_shape[a_ndim - 1];
+  // Get new tol shape.
+  mxnet::TShape temp_new_tol_shape(tol_ndim + 1, 1);
+  for (int i = 0; i < tol_ndim; ++i) { temp_new_tol_shape[i] = tol_shape[i]; }
+  // Get singular value shape.
+  mxnet::TShape temp_s_shape(a_ndim - 1, 0);
+  for (int i = 0; i < a_ndim - 2; ++i) {
+    temp_s_shape[i] = a_shape[i];
+  }
+  temp_s_shape[a_ndim - 2] = std::min(nrow, ncol);
+  // Check binary broadcast shape.
+  mxnet::ShapeVector in_shape_vec({ temp_s_shape, temp_new_tol_shape });
+  mxnet::ShapeVector out_shape_vec(1, mxnet::TShape());
+  mxnet::op::BinaryBroadcastShape(attrs, &in_shape_vec, &out_shape_vec);
+  // Assign shape.
+  if (broadcast_shape) {
+    *broadcast_shape = out_shape_vec[0];
+  }
+  if (new_tol_shape) {
+    *new_tol_shape = temp_new_tol_shape;
+  }
+}
+
+template<typename xpu, typename DType>
+struct WSQ {
+  static size_t SVDWorkspaceSizeQuery(const TBlob& a,
+                                      const mxnet::TShape& u_shape,
+                                      const mxnet::TShape& s_shape,
+                                      const mxnet::TShape& v_shape,
+                                      const OpContext& ctx) {
+    size_t workspace_size = 0;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    const int a_ndim = a.shape_.ndim();
+    const int u_ndim = u_shape.ndim();
+    const int s_ndim = s_shape.ndim();
+    const int v_ndim = v_shape.ndim();
+    mxnet::TShape u_shape2 = Shape2(u_shape[u_ndim - 2], u_shape[u_ndim - 1]);
+    mxnet::TShape s_shape1 = Shape1(s_shape[s_ndim - 1]);
+    mxnet::TShape v_shape2 = Shape2(v_shape[v_ndim - 2], v_shape[v_ndim - 1]);
+    if (xpu::kDevCPU) {
+      std::vector<DType> u_vec(u_shape2.Size(), 0);
+      std::vector<DType> s_vec(s_shape1.Size(), 0);
+      std::vector<DType> v_vec(v_shape2.Size(), 0);
+      // Get workspace size in linalg_gesdd.
+      workspace_size += linalg_gesdd_workspace_query(
+        a.shape_[a_ndim - 2], a.shape_[a_ndim - 1],
+        TBlob(u_vec.data(), u_shape2, a.dev_mask(), a.dev_id()).get<xpu, 2, DType>(s),
+        TBlob(s_vec.data(), s_shape1, a.dev_mask(), a.dev_id()).get<xpu, 1, DType>(s),
+        TBlob(v_vec.data(), v_shape2, a.dev_mask(), a.dev_id()).get<xpu, 2, DType>(s), s);
+    } else {
+      Storage::Handle u_handle =
+        Storage::Get()->Alloc(sizeof(DType) * u_shape2.Size(), Context::GPU());
+      Storage::Handle s_handle =
+        Storage::Get()->Alloc(sizeof(DType) * s_shape1.Size(), Context::GPU());
+      Storage::Handle v_handle =
+        Storage::Get()->Alloc(sizeof(DType) * v_shape2.Size(), Context::GPU());
+      TBlob u_data(static_cast<DType*>(u_handle.dptr), u_shape2, a.dev_mask(), a.dev_id());
+      TBlob s_data(static_cast<DType*>(s_handle.dptr), s_shape1, a.dev_mask(), a.dev_id());
+      TBlob v_data(static_cast<DType*>(v_handle.dptr), v_shape2, a.dev_mask(), a.dev_id());
+      // Get workspace size in linalg_gesvd.
+      if (a.shape_[a_ndim - 2] >= a.shape_[a_ndim - 1]) {
+        workspace_size += linalg_gesvd_workspace_query(v_data.get<xpu, 2, DType>(s),
+                                                       s_data.get<xpu, 1, DType>(s),
+                                                       u_data.get<xpu, 2, DType>(s), s);
+      } else {
+        workspace_size += linalg_gesvd_workspace_query(u_data.get<xpu, 2, DType>(s),
+                                                       s_data.get<xpu, 1, DType>(s),
+                                                       v_data.get<xpu, 2, DType>(s), s);
+      }
+      Storage::Get()->Free(u_handle);
+      Storage::Get()->Free(s_handle);
+      Storage::Get()->Free(v_handle);
+    }
+    return workspace_size;
+  }
+
+  static size_t MatrixRankNoneTolForwardWSQ(size_t *svd_workspace_size,
+                                            const TBlob& a,
+                                            const OpContext& ctx) {
+    size_t workspace_size = 0;
+    mxnet::TShape u_shape, s_shape, v_shape;
+    GetPinvShape(a.shape_, &u_shape, &s_shape, &v_shape);
+    *svd_workspace_size = SVDWorkspaceSizeQuery(a, u_shape, s_shape, v_shape, ctx);
+    workspace_size += *svd_workspace_size;  // For #gesdd_ or #gesvd work space.
+    workspace_size += u_shape.Size();       // For UT.
+    workspace_size += s_shape.Size();       // For S.
+    workspace_size += v_shape.Size();       // For V.
+    return workspace_size * sizeof(DType);
+  }
+
+  static size_t MatrixRankForwardWSQ(size_t *svd_workspace_size,
+                                     const TBlob& a,
+                                     const TBlob& tol,
+                                     const nnvm::NodeAttrs& attrs,
+                                     const OpContext& ctx) {
+    const mxnet::TShape a_shape = a.shape_;
+    const mxnet::TShape tol_shape = tol.shape_;
+    size_t workspace_size = 0;
+    mxnet::TShape u_shape, s_shape, v_shape;
+    GetPinvShape(a.shape_, &u_shape, &s_shape, &v_shape);
+    mxnet::TShape broadcast_shape, new_tol_shape;
+    GetOrCheckBroadcastShape(attrs, a_shape, tol_shape, &broadcast_shape, &new_tol_shape);
+    *svd_workspace_size = SVDWorkspaceSizeQuery(a, u_shape, s_shape, v_shape, ctx);
+    workspace_size += *svd_workspace_size;     // For #gesdd_ or #gesvd work space.
+    workspace_size += u_shape.Size();          // For UT.
+    workspace_size += s_shape.Size();          // For S.
+    workspace_size += v_shape.Size();          // For V.
+    workspace_size += new_tol_shape.Size();    // For tol with newaxis.
+    workspace_size += broadcast_shape.Size();  // For binary broadcast shape.
+    return workspace_size * sizeof(DType);
+  }
+};
+
+template<typename xpu>
+void MatrixRankNoneTolForwardImpl(const TBlob& a,
+                                  const TBlob& rank,
+                                  const nnvm::NodeAttrs& attrs,
+                                  const OpContext& ctx,
+                                  const std::vector<OpReqType>& req) {
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const mxnet::TShape& a_shape = a.shape_;
+  const int a_ndim = a.ndim();
+  MSHADOW_SGL_DBL_TYPE_SWITCH(a.type_flag_, DType, {
+    MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
+      if (a_ndim < 2) {
+        mxnet_op::Kernel<VectorRankKernel<req_type>, xpu>::Launch(
+          s, 1, a.dptr<DType>(), rank.dptr<int64_t>(), a.Size());
+        return;
+      }
+      // a_ndim >= 2
+      const int nrow = a_shape[a_ndim - 2];
+      const int ncol = a_shape[a_ndim - 1];
+      const MatrixRankNoneTolParam& param = nnvm::get<MatrixRankNoneTolParam>(attrs.parsed);
+      CHECK_EQ(param.hermitian, false)
+        << "matrix_rank not support param.hermitian = true at present.";
+      double finfoEps = a.type_flag_ == mshadow::kFloat32 ? param.finfoEps32 : param.finfoEps64;
+      // Step1: Calculate workspace size.
+      size_t svd_workspace_size = 0;
+      size_t workspace_size =
+        WSQ<xpu, DType>::MatrixRankNoneTolForwardWSQ(&svd_workspace_size, a, ctx);
+      Tensor<xpu, 1, char> workspace =
+        ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(workspace_size), s);
+      // Step2: Allocate memory.
+      mxnet::TShape s_shape, u_shape, v_shape, ut_shape, vt_shape;
+      GetPinvShape(a_shape, &u_shape, &s_shape, &v_shape, &ut_shape, &vt_shape);
+      DType *s_ptr = reinterpret_cast<DType*>(workspace.dptr_);
+      DType *u_ptr = s_ptr + s_shape.Size();
+      DType *v_ptr = u_ptr + u_shape.Size();
+      DType *work_ptr = v_ptr + v_shape.Size();
+      TBlob s_data(s_ptr, s_shape, a.dev_mask(), a.dev_id());
+      TBlob u_data(u_ptr, u_shape, a.dev_mask(), a.dev_id());
+      TBlob v_data(v_ptr, v_shape, a.dev_mask(), a.dev_id());
+      TBlob work_data(work_ptr, Shape1(svd_workspace_size), a.dev_mask(), a.dev_id());
+      // Step3: SVD.
+      SVDWrapper::op<xpu, DType>(a, s_data, u_data, ut_shape, v_data, vt_shape, work_data, ctx);
+      // Step4: Calculate rank.
+      const int data_size = s_data.size(s_data.ndim() - 1);
+      const int batch_size = a_ndim == 2 ? 1 : s_shape.ProdShape(0, s_shape.ndim() - 1);
+      mxnet_op::Kernel<MatrixRankNoneTolKernel<req_type>, xpu>::Launch(s, batch_size,
+                                                                       s_data.dptr<DType>(),
+                                                                       rank.dptr<int64_t>(),
+                                                                       nrow, ncol, finfoEps,
+                                                                       data_size, batch_size);
+    });
+  });
+}
+
+template<typename xpu>
+void MatrixRankNoneTolForward(const nnvm::NodeAttrs& attrs,
+                              const OpContext& ctx,
+                              const std::vector<TBlob>& inputs,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  if (kNullOp == req[0]) { return; }
+  CHECK(req[0] == kWriteTo || req[0] == kWriteInplace);
+
+  const TBlob& a = inputs[0];
+  const TBlob& rank = outputs[0];
+  MatrixRankNoneTolForwardImpl<xpu>(a, rank, attrs, ctx, req);
+}
+
+template<typename xpu>
+void MatrixRankForwardImpl(const TBlob& a,
+                           const TBlob& tol,
+                           const TBlob& rank,
+                           const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector<OpReqType>& req) {
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const mxnet::TShape& a_shape = a.shape_;
+  const mxnet::TShape& tol_shape = tol.shape_;
+  const int a_ndim = a.ndim();
+  MSHADOW_SGL_DBL_TYPE_SWITCH(a.type_flag_, DType, {
+    MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
+      if (a_ndim < 2) {
+        mxnet_op::Kernel<VectorRankKernel<req_type>, xpu>::Launch(
+          s, 1, a.dptr<DType>(), rank.dptr<int64_t>(), a.Size());
+        return;
+      }
+      // a_ndim >= 2
+      const MatrixRankParam& param = nnvm::get<MatrixRankParam>(attrs.parsed);
+      CHECK_EQ(param.hermitian, false)
+        << "matrix_rank not support param.hermitian = true at present.";
+      mxnet::TShape s_shape, u_shape, v_shape, ut_shape, vt_shape;
+      GetPinvShape(a_shape, &u_shape, &s_shape, &v_shape, &ut_shape, &vt_shape);
+      mxnet::TShape broadcast_shape, new_tol_shape;
+      GetOrCheckBroadcastShape(attrs, a_shape, tol_shape, &broadcast_shape, &new_tol_shape);
+      // Step1: Calculate workspace size.
+      size_t svd_workspace_size = 0;
+      size_t workspace_size =
+        WSQ<xpu, DType>::MatrixRankForwardWSQ(&svd_workspace_size, a, tol, attrs, ctx);
+      Tensor<xpu, 1, char> workspace =
+        ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(workspace_size), s);
+      // Step2: Allocate memory.
+      DType *s_ptr = reinterpret_cast<DType*>(workspace.dptr_);
+      DType *u_ptr = s_ptr + s_shape.Size();
+      DType *v_ptr = u_ptr + u_shape.Size();
+      DType *work_ptr = v_ptr + v_shape.Size();
+      DType *new_tol_ptr = work_ptr + svd_workspace_size;
+      DType *broadcast_ptr = new_tol_ptr + new_tol_shape.Size();
+      TBlob s_data(s_ptr, s_shape, a.dev_mask(), a.dev_id());
+      TBlob u_data(u_ptr, u_shape, a.dev_mask(), a.dev_id());
+      TBlob v_data(v_ptr, v_shape, a.dev_mask(), a.dev_id());
+      TBlob work_data(work_ptr, Shape1(svd_workspace_size), a.dev_mask(), a.dev_id());
+      TBlob new_tol_data(new_tol_ptr, new_tol_shape, a.dev_mask(), a.dev_id());
+      TBlob broadcast_data(broadcast_ptr, broadcast_shape, a.dev_mask(), a.dev_id());
+      // Step3: SVD.
+      SVDWrapper::op<xpu, DType>(a, s_data, u_data, ut_shape, v_data, vt_shape, work_data, ctx);
+      // Step4: Calculate broadcast data.
+      if (new_tol_data.dptr<DType>() != tol.dptr<DType>()) {
+        Copy(new_tol_data.FlatTo1D<xpu, DType>(s), tol.FlatTo1D<xpu, DType>(s), s);
+      }
+      mxnet::op::BinaryBroadcastCompute<xpu, op::mshadow_op::gt>(attrs, ctx,
+                                                                 {s_data, new_tol_data},
+                                                                 {kWriteTo}, {broadcast_data});
+      // Step5: Calculate rank.
+      const int b_ndim  = broadcast_shape.ndim();
+      const int data_size = broadcast_data.size(b_ndim - 1);
+      const int batch_size = b_ndim == 1 ? 1 : broadcast_shape.ProdShape(0, b_ndim - 1);
+      mxnet_op::Kernel<MatrixRankKernel<req_type>, xpu>::Launch(s, batch_size,
+                                                                broadcast_data.dptr<DType>(),
+                                                                rank.dptr<int64_t>(),
+                                                                data_size, batch_size);
+    });
+  });
+}
+
+template<typename xpu>
+void MatrixRankForward(const nnvm::NodeAttrs& attrs,
+                       const OpContext& ctx,
+                       const std::vector<TBlob>& inputs,
+                       const std::vector<OpReqType>& req,
+                       const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  if (kNullOp == req[0]) { return; }
+  CHECK(req[0] == kWriteTo || req[0] == kWriteInplace);
+
+  const TBlob& a = inputs[0];
+  const TBlob& tol = inputs[1];
+  const TBlob& rank = outputs[0];
+  MatrixRankForwardImpl<xpu>(a, tol, rank, attrs, ctx, req);
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_NUMPY_LINALG_NP_MATRIX_RANK_INL_H_
diff --git a/src/operator/numpy/linalg/np_matrix_rank.cc b/src/operator/numpy/linalg/np_matrix_rank.cc
new file mode 100644
index 000000000000..d3794a1de0e9
--- /dev/null
+++ b/src/operator/numpy/linalg/np_matrix_rank.cc
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2020 by Contributors
+ * \file np_matrix_rank.cc
+ * \brief CPU implementation of the matrix_rank Operator
+ */
+#include "./np_matrix_rank-inl.h"
+
+namespace mxnet {
+namespace op {
+
+inline bool MatrixRankNoneTolShape(const nnvm::NodeAttrs& attrs,
+                                   mxnet::ShapeVector *in_attrs,
+                                   mxnet::ShapeVector *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  const mxnet::TShape& a_shape = (*in_attrs)[0];
+  const int a_ndim = a_shape.ndim();
+
+  if (shape_is_known(a_shape)) {
+    CHECK_GT(a_shape.Size(), 0U)
+      << "Not support zero-size input array which has no identity";
+    if (a_ndim < 2) {
+      SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(0, 0));
+    } else {
+      mxnet::TShape rank_shape(a_ndim - 2, 0);
+      for (int i = 0; i < a_ndim - 2; ++i) { rank_shape[i] = a_shape[i]; }
+      SHAPE_ASSIGN_CHECK(*out_attrs, 0, rank_shape);
+    }
+  }
+  return shape_is_known(*in_attrs) && shape_is_known(*out_attrs);
+}
+
+inline bool MatrixRankNoneTolType(const nnvm::NodeAttrs& attrs,
+                                  std::vector<int>* in_attrs,
+                                  std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  int a_type = in_attrs->at(0);
+
+  CHECK_NE(a_type, mshadow::kFloat16)
+    << "array type float16 is unsupported in linalg.";
+  CHECK(a_type == mshadow::kFloat32 || a_type == mshadow::kFloat64)
+    << "array type should be float32 or float64.";
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt64);
+  return out_attrs->at(0) != -1;
+}
+
+DMLC_REGISTER_PARAMETER(MatrixRankNoneTolParam);
+
+NNVM_REGISTER_OP(_npi_matrix_rank_none_tol)
+.describe(R"code()code" ADD_FILELINE)
+.set_attr_parser(mxnet::op::ParamParser<MatrixRankNoneTolParam>)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames", [](const NodeAttrs& attrs){
+  return std::vector<std::string>{"M"};
+})
+.set_attr<mxnet::FInferShape>("FInferShape", MatrixRankNoneTolShape)
+.set_attr<nnvm::FInferType>("FInferType", MatrixRankNoneTolType)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs){
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr<FCompute>("FCompute<cpu>", MatrixRankNoneTolForward<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.add_argument("M", "NDArray-or-Symbol", "Tensor of matrix")
+.add_arguments(MatrixRankNoneTolParam::__FIELDS__());
+
+inline bool MatrixRankShape(const nnvm::NodeAttrs& attrs,
+                            mxnet::ShapeVector *in_attrs,
+                            mxnet::ShapeVector *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  const mxnet::TShape& a_shape = (*in_attrs)[0];
+  const mxnet::TShape& tol_shape = (*in_attrs)[1];
+  const int a_ndim = a_shape.ndim();
+  const int tol_ndim = tol_shape.ndim();
+
+  if (shape_is_known(a_shape) && shape_is_known(tol_shape)) {
+    CHECK_GT(a_shape.Size(), 0U)
+      << "Not support zero-size input array which has no identity";
+    if (a_ndim < 2) {
+      SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(0, 0));
+    } else {
+      mxnet::TShape broadcast_shape;
+      GetOrCheckBroadcastShape(attrs, a_shape, tol_shape, &broadcast_shape);
+      if (broadcast_shape.ndim() == 1) {
+        if (tol_ndim == 0) {
+          SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(0, 0));
+        } else {
+          SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, 1));
+        }
+      } else {
+        mxnet::TShape rank_shape(broadcast_shape.ndim() - 1, 0);
+        for (int i = 0; i < broadcast_shape.ndim() - 1; ++i) {
+          rank_shape[i] = broadcast_shape[i];
+        }
+        SHAPE_ASSIGN_CHECK(*out_attrs, 0, rank_shape);
+      }
+    }
+  }
+  return shape_is_known(*in_attrs) && shape_is_known(*out_attrs);
+}
+
+inline bool MatrixRankType(const nnvm::NodeAttrs& attrs,
+                           std::vector<int>* in_attrs,
+                           std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  int a_type = in_attrs->at(0);
+  int tol_type = in_attrs->at(1);
+
+  CHECK_NE(a_type, mshadow::kFloat16)
+    << "array type float16 is unsupported in linalg.";
+  CHECK(a_type == mshadow::kFloat32 || a_type == mshadow::kFloat64)
+    << "array type should be float32 or float64.";
+  CHECK(tol_type == mshadow::kFloat32 || tol_type == mshadow::kFloat64)
+    << "tol type should be float32 or float64.";
+  CHECK_EQ(a_type, tol_type)
+    << "array type and tol type should be the same.";
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt64);
+  return out_attrs->at(0) != -1;
+}
+
+DMLC_REGISTER_PARAMETER(MatrixRankParam);
+
+NNVM_REGISTER_OP(_npi_matrix_rank)
+.describe(R"code()code" ADD_FILELINE)
+.set_attr_parser(mxnet::op::ParamParser<MatrixRankParam>)
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames", [](const NodeAttrs& attrs){
+  return std::vector<std::string>{"M", "tol"};
+})
+.set_attr<mxnet::FInferShape>("FInferShape", MatrixRankShape)
+.set_attr<nnvm::FInferType>("FInferType", MatrixRankType)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs){
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr<FCompute>("FCompute<cpu>", MatrixRankForward<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.add_argument("M", "NDArray-or-Symbol", "Tensor of matrix")
+.add_argument("tol", "NDArray-or-Symbol", "Tensor of matrix")
+.add_arguments(MatrixRankParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/numpy/linalg/np_matrix_rank.cu b/src/operator/numpy/linalg/np_matrix_rank.cu
new file mode 100644
index 000000000000..9528f698d35c
--- /dev/null
+++ b/src/operator/numpy/linalg/np_matrix_rank.cu
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2020 by Contributors
+ * \file np_matrix_rank.cu
+ * \brief GPU implementation of the matrix_rank Operator
+ */
+#include <mxnet/operator_util.h>
+#include "./np_matrix_rank-inl.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_npi_matrix_rank_none_tol)
+.set_attr<FCompute>("FCompute<gpu>", MatrixRankNoneTolForward<gpu>);
+
+NNVM_REGISTER_OP(_npi_matrix_rank)
+.set_attr<FCompute>("FCompute<gpu>", MatrixRankForward<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index 1f795721b820..79a45f5e46f6 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -2113,12 +2113,29 @@ def _add_workload_linalg_matrix_power():
 
 
 def _add_workload_linalg_matrix_rank():
-    a = np.eye(4)
-    b = a; b[-1,-1] = 0
-    c = np.ones((4,))
-    OpArgMngr.add_workload('linalg.matrix_rank', a)
-    OpArgMngr.add_workload('linalg.matrix_rank', b)
-    OpArgMngr.add_workload('linalg.matrix_rank', c)
+    shapes = [
+        ((4, 3), ()),
+        ((4, 3), (1,)),
+        ((4, 3), (2, 3,)),
+        ((2, 1, 1), (1,)),
+        ((2, 3, 3), (2,)),
+        ((2, 3, 1, 1), ()),
+        ((2, 3, 4, 4), (1, 3)),
+        ((2, 3, 4, 5), (2, 3)),
+        ((2, 3, 5, 4), (2, 3)),
+    ]
+    dtypes = (np.float32, np.float64)
+    for dtype in dtypes:
+        for a_shape, tol_shape in shapes:
+            for tol_is_none in [True, False]:
+                a_np = _np.asarray(_np.random.uniform(-10., 10., a_shape))
+                a = np.array(a_np, dtype=dtype)
+                if tol_is_none:
+                    OpArgMngr.add_workload('linalg.matrix_rank', a, None, False)
+                else:
+                    tol_np = _np.random.uniform(10., 20., tol_shape)
+                    tol = np.array(tol_np, dtype=dtype)
+                    OpArgMngr.add_workload('linalg.matrix_rank', a, tol, False)
 
 
 def _add_workload_linalg_multi_dot():
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index 063c3b7a58a6..111f0282283e 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -5625,6 +5625,83 @@ def check_lstsq(a_np, b_np, rcond_np, x, residuals, rank, s):
                 check_lstsq(a_np, b_np, rcond, x, residuals, rank, s)
 
 
+@with_seed()
+@use_np
+def test_np_linalg_matrix_rank():
+    class TestMatrixRank(HybridBlock):
+        def __init__(self, hermitian):
+            super(TestMatrixRank, self).__init__()
+            self._hermitian = hermitian
+
+        def hybrid_forward(self, F, M, tol=None):
+            return F.np.linalg.matrix_rank(M, tol, hermitian=self._hermitian)
+
+    def check_matrix_rank(rank, a_np, tol, hermitian):
+        try:
+            rank_expected = _np.linalg.matrix_rank(a_np, tol=tol, hermitian=hermitian)
+        except Exception as e:
+            print("a:", a_np)
+            print("a shape:", a_np.shape)
+            print(e)
+        else:
+            if a_np.ndim < 2:
+                assert rank.shape == _np.asarray(rank_expected).shape
+            else:
+                assert rank.shape == rank_expected.shape
+            assert_almost_equal(rank.asnumpy(), rank_expected, rtol=rtol, atol=atol)
+
+    shapes = [
+        ((), ()),
+        ((1,), (1,)),
+        ((3,), (1,)),
+        ((1, 1), ()),
+        ((1, 1), (1,)),
+        ((3, 3), (1,)),
+        ((3, 4), (1,)),
+        ((4, 3), ()),
+        ((4, 3), (1,)),
+        ((4, 3), (2,)),
+        ((4, 3), (2, 3,)),
+        ((2, 1, 1), ()),
+        ((2, 1, 1), (1,)),
+        ((2, 3, 3), (2,)),
+        ((2, 3, 4), (1,)),
+        ((2, 4, 3), (2,)),
+        ((2, 3, 1, 1), ()),
+        ((2, 3, 1, 1), (1, 1)),
+        ((2, 3, 1, 1), (2, 1)),
+        ((2, 3, 4, 4), (1, 3)),
+        ((2, 3, 4, 5), (2, 1)),
+        ((2, 3, 5, 4), (1, 3)),
+        ((2, 3, 1, 1), (2, 3)),
+        ((2, 3, 4, 4), (2, 3)),
+        ((2, 3, 4, 5), (2, 3)),
+        ((2, 3, 5, 4), (2, 3)),
+    ]
+    dtypes = ['float32', 'float64']
+    for dtype in dtypes:
+        for a_shape, tol_shape in shapes:
+            for tol_is_none, hybridize in itertools.product([True, False], [True, False]):
+                rtol = 1e-3
+                atol = 1e-5
+                test_matrix_rank = TestMatrixRank(hermitian=False)
+                if hybridize:
+                    test_matrix_rank.hybridize()
+
+                a_np = _np.asarray(_np.random.uniform(-10., 10., a_shape))
+                a = np.array(a_np, dtype=dtype)
+                if tol_is_none:
+                    rank = test_matrix_rank(a)
+                    # check matrix_rank validity
+                    check_matrix_rank(rank, a.asnumpy(), tol=None, hermitian=False)
+                else:
+                    tol_np = _np.random.uniform(10., 20., tol_shape)
+                    tol = np.array(tol_np, dtype=dtype)
+                    rank = test_matrix_rank(a, tol)
+                    # check matrix_rank validity
+                    check_matrix_rank(rank, a.asnumpy(), tol.asnumpy(), hermitian=False)
+
+
 @with_seed()
 @use_np
 def test_np_linalg_pinv():